summaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2022-02-08 00:01:17 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2022-02-09 11:47:34 -0800
commit7a888e8603f70351094b0725fba76c4111e72dac (patch)
treeb5d140ba212cf88a18a9ccdd9b65665ee24aae9d /src/Text
parent7dc59aa26ae368e885d6c1070c8c576030b705d1 (diff)
Fix parsing of epub footnotes.
Closes #7884.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs81
-rw-r--r--src/Text/Pandoc/Readers/HTML/Types.hs1
2 files changed, 60 insertions, 22 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index d3b91c370..47b6af193 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -89,7 +89,7 @@ readHtml opts inp = do
result <- flip runReaderT def $
runParserT parseDoc
(HTMLState def{ stateOptions = opts }
- [] Nothing Set.empty [] M.empty opts)
+ [] Nothing Set.empty [] M.empty opts False)
"source" tags
case result of
Right doc -> return doc
@@ -106,8 +106,8 @@ stripPrefix x = x
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
- st <- getState
- walkM (replaceNotes' (noteTable st)) bs
+ notes <- noteTable <$> getState
+ walkM (replaceNotes' notes) bs
replaceNotes' :: PandocMonad m
=> [(Text, Blocks)] -> Inline -> TagParser m Inline
@@ -179,6 +179,9 @@ block = ((do
, "chapter" `T.isInfixOf` type'
-> eSection
_ | epubExts
+ , type' `elem` ["footnotes", "rearnotes"]
+ -> eFootnotes
+ _ | epubExts
, type' `elem` ["footnote", "rearnote"]
-> mempty <$ eFootnote
_ | epubExts
@@ -256,19 +259,39 @@ eCase = do
Nothing -> Nothing <$ manyTill pAny (pSatisfy (matchTagClose "case"))
eFootnote :: PandocMonad m => TagParser m ()
-eFootnote = try $ do
- let notes = ["footnote", "rearnote"]
+eFootnote = do
guardEnabled Ext_epub_html_exts
- (TagOpen tag attr') <- lookAhead pAny
+ TagOpen tag attr' <- lookAhead $ pSatisfy
+ (\case
+ TagOpen _ attr'
+ -> case lookup "type" attr' <|> lookup "epub:type" attr' of
+ Just "footnote" -> True
+ Just "rearnote" -> True
+ _ -> False
+ _ -> False)
let attr = toStringAttr attr'
- guard $ maybe False (`elem` notes)
- (lookup "type" attr <|> lookup "epub:type" attr)
let ident = fromMaybe "" (lookup "id" attr)
content <- pInTags tag block
- addNote ident content
+ updateState $ \s ->
+ s {noteTable = (ident, content) : noteTable s}
-addNote :: PandocMonad m => Text -> Blocks -> TagParser m ()
-addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s})
+eFootnotes :: PandocMonad m => TagParser m Blocks
+eFootnotes = try $ do
+ let notes = ["footnotes", "rearnotes"]
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr') <- lookAhead pAny
+ let attr = toStringAttr attr'
+ guard $ maybe False (`elem` notes)
+ (lookup "type" attr <|> lookup "epub:type" attr)
+ updateState $ \s -> s{ inFootnotes = True }
+ result <- pInTags tag block
+ updateState $ \s -> s{ inFootnotes = False }
+ if null result
+ -- if it just contains notes, we don't need the container:
+ then return result
+ -- but there might be content other than notes, in which case
+ -- we want a div:
+ else return $ B.divWith (toAttr attr') result
eNoteref :: PandocMonad m => TagParser m Inlines
eNoteref = try $ do
@@ -337,6 +360,7 @@ parseTypeAttr _ = DefaultStyle
pOrderedList :: PandocMonad m => TagParser m Blocks
pOrderedList = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
+ isNoteList <- inFootnotes <$> getState
let attribs = toStringAttr attribs'
let start = fromMaybe 1 $ lookup "start" attribs >>= safeRead
let style = fromMaybe DefaultStyle
@@ -352,8 +376,14 @@ pOrderedList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pListItem nonItem) (pCloses "ol")
- return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
+ if isNoteList
+ then do
+ _ <- manyTill (eFootnote <|> pBlank) (pCloses "ol")
+ return mempty
+ else do
+ items <- manyTill (pListItem nonItem) (pCloses "ol")
+ return $ B.orderedListWith (start, style, DefaultDelim) $
+ map (fixPlains True) items
pDefinitionList :: PandocMonad m => TagParser m Blocks
pDefinitionList = try $ do
@@ -518,7 +548,10 @@ pHeader = try $ do
pHrule :: PandocMonad m => TagParser m Blocks
pHrule = do
pSelfClosing (=="hr") (const True)
- return B.horizontalRule
+ inNotes <- inFootnotes <$> getState
+ return $ if inNotes
+ then mempty
+ else B.horizontalRule
pBlockQuote :: PandocMonad m => TagParser m Blocks
pBlockQuote = do
@@ -715,14 +748,18 @@ pLink = try $ do
let title = fromAttrib "title" tag
let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
lab <- mconcat <$> manyTill inline (pCloses "a")
- -- check for href; if href, then a link, otherwise a span
- case maybeFromAttrib "href" tag of
- Nothing ->
- return $ extractSpaces (B.spanWith attr) lab
- Just url' -> do
- url <- canonicalizeUrl url'
- return $ extractSpaces
- (B.linkWith attr (escapeURI url) title) lab
+ st <- getState
+ if inFootnotes st && maybeFromAttrib "role" tag == Just "doc-backlink"
+ then return mempty
+ else do
+ -- check for href; if href, then a link, otherwise a span
+ case maybeFromAttrib "href" tag of
+ Nothing ->
+ return $ extractSpaces (B.spanWith attr) lab
+ Just url' -> do
+ url <- canonicalizeUrl url'
+ return $ extractSpaces
+ (B.linkWith attr (escapeURI url) title) lab
pImage :: PandocMonad m => TagParser m Inlines
pImage = do
diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs
index 97ad48c88..b70e81939 100644
--- a/src/Text/Pandoc/Readers/HTML/Types.hs
+++ b/src/Text/Pandoc/Readers/HTML/Types.hs
@@ -52,6 +52,7 @@ data HTMLState = HTMLState
, logMessages :: [LogMessage]
, macros :: Map Text Macro
, readerOpts :: ReaderOptions
+ , inFootnotes :: Bool
}
-- | Local HTML parser state