diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2022-02-08 00:01:17 -0800 |
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2022-02-09 11:47:34 -0800 |
| commit | 7a888e8603f70351094b0725fba76c4111e72dac (patch) | |
| tree | b5d140ba212cf88a18a9ccdd9b65665ee24aae9d /src/Text | |
| parent | 7dc59aa26ae368e885d6c1070c8c576030b705d1 (diff) | |
Fix parsing of epub footnotes.
Closes #7884.
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 81 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML/Types.hs | 1 |
2 files changed, 60 insertions, 22 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index d3b91c370..47b6af193 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -89,7 +89,7 @@ readHtml opts inp = do result <- flip runReaderT def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } - [] Nothing Set.empty [] M.empty opts) + [] Nothing Set.empty [] M.empty opts False) "source" tags case result of Right doc -> return doc @@ -106,8 +106,8 @@ stripPrefix x = x replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do - st <- getState - walkM (replaceNotes' (noteTable st)) bs + notes <- noteTable <$> getState + walkM (replaceNotes' notes) bs replaceNotes' :: PandocMonad m => [(Text, Blocks)] -> Inline -> TagParser m Inline @@ -179,6 +179,9 @@ block = ((do , "chapter" `T.isInfixOf` type' -> eSection _ | epubExts + , type' `elem` ["footnotes", "rearnotes"] + -> eFootnotes + _ | epubExts , type' `elem` ["footnote", "rearnote"] -> mempty <$ eFootnote _ | epubExts @@ -256,19 +259,39 @@ eCase = do Nothing -> Nothing <$ manyTill pAny (pSatisfy (matchTagClose "case")) eFootnote :: PandocMonad m => TagParser m () -eFootnote = try $ do - let notes = ["footnote", "rearnote"] +eFootnote = do guardEnabled Ext_epub_html_exts - (TagOpen tag attr') <- lookAhead pAny + TagOpen tag attr' <- lookAhead $ pSatisfy + (\case + TagOpen _ attr' + -> case lookup "type" attr' <|> lookup "epub:type" attr' of + Just "footnote" -> True + Just "rearnote" -> True + _ -> False + _ -> False) let attr = toStringAttr attr' - guard $ maybe False (`elem` notes) - (lookup "type" attr <|> lookup "epub:type" attr) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block - addNote ident content + updateState $ \s -> + s {noteTable = (ident, content) : noteTable s} -addNote :: PandocMonad m => Text -> Blocks -> TagParser m () -addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) +eFootnotes :: PandocMonad m => TagParser m Blocks +eFootnotes = try $ do + let notes = ["footnotes", "rearnotes"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr') <- lookAhead pAny + let attr = toStringAttr attr' + guard $ maybe False (`elem` notes) + (lookup "type" attr <|> lookup "epub:type" attr) + updateState $ \s -> s{ inFootnotes = True } + result <- pInTags tag block + updateState $ \s -> s{ inFootnotes = False } + if null result + -- if it just contains notes, we don't need the container: + then return result + -- but there might be content other than notes, in which case + -- we want a div: + else return $ B.divWith (toAttr attr') result eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do @@ -337,6 +360,7 @@ parseTypeAttr _ = DefaultStyle pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) + isNoteList <- inFootnotes <$> getState let attribs = toStringAttr attribs' let start = fromMaybe 1 $ lookup "start" attribs >>= safeRead let style = fromMaybe DefaultStyle @@ -352,8 +376,14 @@ pOrderedList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pListItem nonItem) (pCloses "ol") - return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items + if isNoteList + then do + _ <- manyTill (eFootnote <|> pBlank) (pCloses "ol") + return mempty + else do + items <- manyTill (pListItem nonItem) (pCloses "ol") + return $ B.orderedListWith (start, style, DefaultDelim) $ + map (fixPlains True) items pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList = try $ do @@ -518,7 +548,10 @@ pHeader = try $ do pHrule :: PandocMonad m => TagParser m Blocks pHrule = do pSelfClosing (=="hr") (const True) - return B.horizontalRule + inNotes <- inFootnotes <$> getState + return $ if inNotes + then mempty + else B.horizontalRule pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote = do @@ -715,14 +748,18 @@ pLink = try $ do let title = fromAttrib "title" tag let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr' lab <- mconcat <$> manyTill inline (pCloses "a") - -- check for href; if href, then a link, otherwise a span - case maybeFromAttrib "href" tag of - Nothing -> - return $ extractSpaces (B.spanWith attr) lab - Just url' -> do - url <- canonicalizeUrl url' - return $ extractSpaces - (B.linkWith attr (escapeURI url) title) lab + st <- getState + if inFootnotes st && maybeFromAttrib "role" tag == Just "doc-backlink" + then return mempty + else do + -- check for href; if href, then a link, otherwise a span + case maybeFromAttrib "href" tag of + Nothing -> + return $ extractSpaces (B.spanWith attr) lab + Just url' -> do + url <- canonicalizeUrl url' + return $ extractSpaces + (B.linkWith attr (escapeURI url) title) lab pImage :: PandocMonad m => TagParser m Inlines pImage = do diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs index 97ad48c88..b70e81939 100644 --- a/src/Text/Pandoc/Readers/HTML/Types.hs +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -52,6 +52,7 @@ data HTMLState = HTMLState , logMessages :: [LogMessage] , macros :: Map Text Macro , readerOpts :: ReaderOptions + , inFootnotes :: Bool } -- | Local HTML parser state |
