diff options
| author | Julia Diaz <julia.diaz@gmail.com> | 2023-08-10 22:02:43 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-10 13:02:43 -0700 |
| commit | 6673f832ea6bb2498f588b78eb6fb8c099fb408a (patch) | |
| tree | aff547ab132cbc6cec63eb569de19f222e5961e1 /src | |
| parent | cbb33fefb758250e745d9795ce97cc91e293626a (diff) | |
Fix display of block elements in JATS reader (PR #8971)
A number of block elements, like disp-quote, list, and disp-formula, were always treated as inlines if appearing inside paragraphs, even if their usage granted a separate block. The function isElementBlock has been refined to prevent this, and a number of specific parse cases have been added to parseBlock.
Also, some minimal cleanup of the test file, in order for it to pass XML validation against the JATS DTD 1.3 (it was not compliant with the current or any previous versions of JATS).
Closes #8889.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers/JATS.hs | 103 |
1 files changed, 65 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index 538fb8264..5056eb3ff 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -15,7 +15,7 @@ Conversion of JATS XML to 'Pandoc' document. module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict ( StateT(runStateT), gets, modify ) -import Control.Monad (forM_, when, unless, MonadPlus(mplus)) +import Control.Monad (forM_, when, unless) import Control.Monad.Except (throwError) import Text.Pandoc.Error (PandocError(..)) import Data.Char (isDigit, isSpace) @@ -105,31 +105,30 @@ instance HasMeta JATSState where deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)} isBlockElement :: Content -> Bool -isBlockElement (Elem e) = qName (elName e) `S.member` blocktags - where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags - paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap", - "code", "fig", "fig-group", "graphic", "media", "preformat", +isBlockElement (Elem e) = case qName (elName e) of + "disp-formula" -> if onlyOneChild e + then if hasFormulaChild e + then False + else case filterChild (named "alternatives") e of + Just a -> if hasFormulaChild a then False else True + Nothing -> True + else True + "alternatives" -> if hasFormulaChild e then False else True + _ -> qName (elName e) `S.member` blocktags + + where blocktags = S.fromList (paragraphLevel ++ lists ++ formulae ++ other) \\ S.fromList canBeInline + paragraphLevel = ["address", "answer", "answer-set", "array", "boxed-text", "chem-struct-wrap", + "code", "explanation", "fig", "fig-group", "graphic", "media", "preformat", "question", "question-wrap", "question-wrap-group", "supplementary-material", "table-wrap", "table-wrap-group", "alternatives", "disp-formula", "disp-formula-group"] lists = ["def-list", "list"] - mathML = ["tex-math", "mml:math"] + formulae = ["tex-math", "mml:math"] other = ["p", "related-article", "related-object", "ack", "disp-quote", "speech", "statement", "verse-group", "x"] - inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material", - "related-article", "related-object", "hr", "bold", "fixed-case", - "italic", "monospace", "overline", "overline-start", "overline-end", - "roman", "sans-serif", "sc", "strike", "underline", "underline-start", - "underline-end", "ruby", "alternatives", "inline-graphic", "private-char", - "chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev", - "milestone-end", "milestone-start", "named-content", "styled-content", - "fn", "target", "xref", "sub", "sup", "x", "address", "array", - "boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic", - "media", "preformat", "supplementary-material", "table-wrap", - "table-wrap-group", "disp-formula", "disp-formula-group", - "citation-alternatives", "element-citation", "mixed-citation", - "nlm-citation", "award-id", "funding-source", "open-access", - "def-list", "list", "ack", "disp-quote", "speech", "statement", - "verse-group"] + canBeInline = ["tex-math", "mml:math", "related-object", "x"] + onlyOneChild x = length (allChildren x) == 1 + allChildren x = filterChildren (const True) x + isBlockElement _ = False -- Trim leading and trailing newline characters @@ -217,6 +216,13 @@ parseBlock (Elem e) = do else divWith (attrValue "id" e, ["caption"], []) <$> wrapWithHeader 6 (getBlocks e) "fn-group" -> parseFootnoteGroup "ref-list" -> parseRefList e + "alternatives" -> if hasFormulaChild e + then blockFormula displayMath e + else getBlocks e + "disp-formula" -> if hasFormulaChild e + then blockFormula displayMath e + else divWith (attrValue "id" e, ["disp-formula"], []) + <$> getBlocks e "?xml" -> return mempty _ -> getBlocks e where parseMixed container conts = do @@ -363,9 +369,7 @@ parseBlock (Elem e) = do wrapWithHeader n mBlocks = do isBook <- gets jatsBook let n' = if isBook || n == 0 then n + 1 else n - headerText <- case filterChild (named "title") e `mplus` - (filterChild (named "info") e >>= - filterChild (named "title")) of + headerText <- case filterChild (named "title") e of Just t -> getInlines t Nothing -> return mempty oldN <- gets jatsSectionLevel @@ -602,8 +606,11 @@ parseInline (Elem e) = let attr = (attrValue "id" e, [], []) return $ linkWith attr href title ils' - "disp-formula" -> formula displayMath - "inline-formula" -> formula math + "alternatives" -> if hasFormulaChild e + then inlineFormula math e + else innerInlines id + "disp-formula" -> inlineFormula displayMath e + "inline-formula" -> inlineFormula math e "math" | qURI (elName e) == Just "http://www.w3.org/1998/Math/MathML" -> return . math $ mathML e "tex-math" -> return . math $ textContent e @@ -616,11 +623,14 @@ parseInline (Elem e) = _ -> innerInlines id where innerInlines f = extractSpaces f . mconcat <$> mapM parseInline (elContent e) - mathML x = - case readMathML . showElement $ everywhere (mkT removePrefix) x of - Left _ -> mempty - Right m -> writeTeX m - formula constructor = do + codeWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + l -> [l] + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e + +inlineFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Inlines +inlineFormula constructor e = do let whereToLook = fromMaybe e $ filterElement (named "alternatives") e texMaths = map textContent $ filterChildren (named "tex-math") whereToLook @@ -628,12 +638,29 @@ parseInline (Elem e) = filterChildren isMathML whereToLook return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs - isMathML x = qName (elName x) == "math" && +blockFormula :: PandocMonad m => (Text->Inlines) -> Element -> JATS m Blocks +blockFormula constructor e = do + let whereToLook = fromMaybe e $ filterElement (named "alternatives") e + texMaths = map textContent $ + filterChildren (named "tex-math") whereToLook + mathMLs = map mathML $ + filterChildren isMathML whereToLook + return . para . head . take 1 . map constructor $ texMaths ++ mathMLs + +mathML :: Element -> Text +mathML x = + case readMathML . showElement $ everywhere (mkT removePrefix) x of + Left _ -> mempty + Right m -> writeTeX m + where removePrefix elname = elname { qPrefix = Nothing } + +isMathML :: Element -> Bool +isMathML x = qName (elName x) == "math" && qURI (elName x) == Just "http://www.w3.org/1998/Math/MathML" - removePrefix elname = elname { qPrefix = Nothing } - codeWithLang = do - let classes' = case attrValue "language" e of - "" -> [] - l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e + +formulaChildren :: Element -> [Element] +formulaChildren x = filterChildren isMathML x ++ filterChildren (named "tex-math") x + +hasFormulaChild :: Element -> Bool +hasFormulaChild x = length (formulaChildren x) > 0
\ No newline at end of file |
