summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJulia Diaz <julia.diaz@gmail.com>2023-08-10 22:02:43 +0200
committerGitHub <noreply@github.com>2023-08-10 13:02:43 -0700
commit6673f832ea6bb2498f588b78eb6fb8c099fb408a (patch)
treeaff547ab132cbc6cec63eb569de19f222e5961e1 /src
parentcbb33fefb758250e745d9795ce97cc91e293626a (diff)
Fix display of block elements in JATS reader (PR #8971)
A number of block elements, like disp-quote, list, and disp-formula, were always treated as inlines if appearing inside paragraphs, even if their usage granted a separate block. The function isElementBlock has been refined to prevent this, and a number of specific parse cases have been added to parseBlock. Also, some minimal cleanup of the test file, in order for it to pass XML validation against the JATS DTD 1.3 (it was not compliant with the current or any previous versions of JATS). Closes #8889.
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs103
1 files changed, 65 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index 538fb8264..5056eb3ff 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -15,7 +15,7 @@ Conversion of JATS XML to 'Pandoc' document.
module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict ( StateT(runStateT), gets, modify )
-import Control.Monad (forM_, when, unless, MonadPlus(mplus))
+import Control.Monad (forM_, when, unless)
import Control.Monad.Except (throwError)
import Text.Pandoc.Error (PandocError(..))
import Data.Char (isDigit, isSpace)
@@ -105,31 +105,30 @@ instance HasMeta JATSState where
deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)}
isBlockElement :: Content -> Bool
-isBlockElement (Elem e) = qName (elName e) `S.member` blocktags
- where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags
- paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap",
- "code", "fig", "fig-group", "graphic", "media", "preformat",
+isBlockElement (Elem e) = case qName (elName e) of
+ "disp-formula" -> if onlyOneChild e
+ then if hasFormulaChild e
+ then False
+ else case filterChild (named "alternatives") e of
+ Just a -> if hasFormulaChild a then False else True
+ Nothing -> True
+ else True
+ "alternatives" -> if hasFormulaChild e then False else True
+ _ -> qName (elName e) `S.member` blocktags
+
+ where blocktags = S.fromList (paragraphLevel ++ lists ++ formulae ++ other) \\ S.fromList canBeInline
+ paragraphLevel = ["address", "answer", "answer-set", "array", "boxed-text", "chem-struct-wrap",
+ "code", "explanation", "fig", "fig-group", "graphic", "media", "preformat", "question", "question-wrap", "question-wrap-group",
"supplementary-material", "table-wrap", "table-wrap-group",
"alternatives", "disp-formula", "disp-formula-group"]
lists = ["def-list", "list"]
- mathML = ["tex-math", "mml:math"]
+ formulae = ["tex-math", "mml:math"]
other = ["p", "related-article", "related-object", "ack", "disp-quote",
"speech", "statement", "verse-group", "x"]
- inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material",
- "related-article", "related-object", "hr", "bold", "fixed-case",
- "italic", "monospace", "overline", "overline-start", "overline-end",
- "roman", "sans-serif", "sc", "strike", "underline", "underline-start",
- "underline-end", "ruby", "alternatives", "inline-graphic", "private-char",
- "chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev",
- "milestone-end", "milestone-start", "named-content", "styled-content",
- "fn", "target", "xref", "sub", "sup", "x", "address", "array",
- "boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic",
- "media", "preformat", "supplementary-material", "table-wrap",
- "table-wrap-group", "disp-formula", "disp-formula-group",
- "citation-alternatives", "element-citation", "mixed-citation",
- "nlm-citation", "award-id", "funding-source", "open-access",
- "def-list", "list", "ack", "disp-quote", "speech", "statement",
- "verse-group"]
+ canBeInline = ["tex-math", "mml:math", "related-object", "x"]
+ onlyOneChild x = length (allChildren x) == 1
+ allChildren x = filterChildren (const True) x
+
isBlockElement _ = False
-- Trim leading and trailing newline characters
@@ -217,6 +216,13 @@ parseBlock (Elem e) = do
else divWith (attrValue "id" e, ["caption"], []) <$> wrapWithHeader 6 (getBlocks e)
"fn-group" -> parseFootnoteGroup
"ref-list" -> parseRefList e
+ "alternatives" -> if hasFormulaChild e
+ then blockFormula displayMath e
+ else getBlocks e
+ "disp-formula" -> if hasFormulaChild e
+ then blockFormula displayMath e
+ else divWith (attrValue "id" e, ["disp-formula"], [])
+ <$> getBlocks e
"?xml" -> return mempty
_ -> getBlocks e
where parseMixed container conts = do
@@ -363,9 +369,7 @@ parseBlock (Elem e) = do
wrapWithHeader n mBlocks = do
isBook <- gets jatsBook
let n' = if isBook || n == 0 then n + 1 else n
- headerText <- case filterChild (named "title") e `mplus`
- (filterChild (named "info") e >>=
- filterChild (named "title")) of
+ headerText <- case filterChild (named "title") e of
Just t -> getInlines t
Nothing -> return mempty
oldN <- gets jatsSectionLevel
@@ -602,8 +606,11 @@ parseInline (Elem e) =
let attr = (attrValue "id" e, [], [])
return $ linkWith attr href title ils'
- "disp-formula" -> formula displayMath
- "inline-formula" -> formula math
+ "alternatives" -> if hasFormulaChild e
+ then inlineFormula math e
+ else innerInlines id
+ "disp-formula" -> inlineFormula displayMath e
+ "inline-formula" -> inlineFormula math e
"math" | qURI (elName e) == Just "http://www.w3.org/1998/Math/MathML"
-> return . math $ mathML e
"tex-math" -> return . math $ textContent e
@@ -616,11 +623,14 @@ parseInline (Elem e) =
_ -> innerInlines id
where innerInlines f = extractSpaces f . mconcat <$>
mapM parseInline (elContent e)
- mathML x =
- case readMathML . showElement $ everywhere (mkT removePrefix) x of
- Left _ -> mempty
- Right m -> writeTeX m
- formula constructor = do
+ codeWithLang = do
+ let classes' = case attrValue "language" e of
+ "" -> []
+ l -> [l]
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e
+
+inlineFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Inlines
+inlineFormula constructor e = do
let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
texMaths = map textContent $
filterChildren (named "tex-math") whereToLook
@@ -628,12 +638,29 @@ parseInline (Elem e) =
filterChildren isMathML whereToLook
return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs
- isMathML x = qName (elName x) == "math" &&
+blockFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Blocks
+blockFormula constructor e = do
+ let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
+ texMaths = map textContent $
+ filterChildren (named "tex-math") whereToLook
+ mathMLs = map mathML $
+ filterChildren isMathML whereToLook
+ return . para . head . take 1 . map constructor $ texMaths ++ mathMLs
+
+mathML :: Element -> Text
+mathML x =
+ case readMathML . showElement $ everywhere (mkT removePrefix) x of
+ Left _ -> mempty
+ Right m -> writeTeX m
+ where removePrefix elname = elname { qPrefix = Nothing }
+
+isMathML :: Element -> Bool
+isMathML x = qName (elName x) == "math" &&
qURI (elName x) ==
Just "http://www.w3.org/1998/Math/MathML"
- removePrefix elname = elname { qPrefix = Nothing }
- codeWithLang = do
- let classes' = case attrValue "language" e of
- "" -> []
- l -> [l]
- return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e
+
+formulaChildren :: Element -> [Element]
+formulaChildren x = filterChildren isMathML x ++ filterChildren (named "tex-math") x
+
+hasFormulaChild :: Element -> Bool
+hasFormulaChild x = length (formulaChildren x) > 0 \ No newline at end of file