summaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2022-10-18 10:09:27 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2022-10-18 10:09:27 -0700
commit20492d523c8324e36781cfbbc8092c796f94b151 (patch)
tree13a31b738ec90f447d0c0c03dbd6296c44d28302 /src/Text
parente5fbddd3b6c0c7a3b76b313edbe55242e3b138fc (diff)
T.P.Parsing: export `registerIdentifier`.
[API change] Use this in the HTML reader to register identifiers to avoid duplicates created by `auto_identifiers`.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Parsing.hs2
-rw-r--r--src/Text/Pandoc/Parsing/General.hs15
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs39
3 files changed, 31 insertions, 25 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 501e8c74b..85918ff11 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -96,6 +96,7 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
Key (..),
toKey,
registerHeader,
+ registerIdentifier,
smartPunctuation,
singleQuoteStart,
singleQuoteEnd,
@@ -288,6 +289,7 @@ import Text.Pandoc.Parsing.General
parseFromString',
readWith,
registerHeader,
+ registerIdentifier,
sepBy1',
skipSpaces,
spaceChar,
diff --git a/src/Text/Pandoc/Parsing/General.hs b/src/Text/Pandoc/Parsing/General.hs
index e425c7408..ae892addb 100644
--- a/src/Text/Pandoc/Parsing/General.hs
+++ b/src/Text/Pandoc/Parsing/General.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Parsing.General
, readWith
, readWithM
, registerHeader
+ , registerIdentifier
, sepBy1'
, skipSpaces
, spaceChar
@@ -670,8 +671,8 @@ registerHeader (ident,classes,kvs) header' = do
let id'' = if Ext_ascii_identifiers `extensionEnabled` exts
then toAsciiText id'
else id'
- updateState $ updateIdentifierList $ Set.insert id'
- updateState $ updateIdentifierList $ Set.insert id''
+ registerIdentifier id'
+ when (id'' /= id') $ registerIdentifier id''
return (id'',classes,kvs)
else do
unless (T.null ident) $ do
@@ -681,6 +682,16 @@ registerHeader (ident,classes,kvs) header' = do
updateState $ updateIdentifierList $ Set.insert ident
return (ident,classes,kvs)
+-- | Update list of identifiers in state to prevent auto_identifiers
+-- from duplicating existing identifiers.
+registerIdentifier :: (Stream s m a, HasIdentifierList st)
+ => Text -> ParsecT s st m ()
+registerIdentifier ident
+ | T.null ident = return ()
+ | otherwise = unless (T.null ident) $
+ updateState $ updateIdentifierList $ Set.insert ident
+
+
-- This is used to prevent exponential blowups for things like:
-- a**a*a**a*a**a*a**a*a**a*a**a*a**
nested :: Stream s m a
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f44859ef8..d56d846bb 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -292,8 +292,8 @@ eFootnotes = try $ do
-- but there might be content other than notes, in which case
-- we want a div:
else do
- let attr'' =toAttr attr'
- updateIdentifiers attr''
+ let attr''@(ident,_,_) = toAttr attr'
+ registerIdentifier ident
return $ B.divWith attr'' result
eNoteref :: PandocMonad m => TagParser m Inlines
@@ -459,7 +459,7 @@ pDiv = try $ do
guardEnabled Ext_native_divs
TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
let (ident, classes, kvs) = toAttr attr'
- updateIdentifiers (ident, classes, kvs)
+ registerIdentifier ident
contents <- pInTags tag block
let classes' = if tag == "section"
then "section":classes
@@ -608,7 +608,7 @@ pCodeBlock = try $ do
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
-- if the `pre` has no attributes, try if it is followed by a `code`
-- element and use those attributes if possible.
- attr <- case attr' of
+ attr@(ident,_,_) <- case attr' of
_:_ -> pure (toAttr attr')
[] -> option nullAttr $ do
TagOpen _ codeAttr <- pSatisfy (matchTagOpen "code" [])
@@ -618,7 +618,7 @@ pCodeBlock = try $ do
, let v' = if k == "class"
then fromMaybe v (T.stripPrefix "language-" v)
else v ]
- updateIdentifiers attr
+ registerIdentifier ident
contents <- manyTill pAny (pCloses "pre" <|> eof)
let rawText = T.concat $ map tagToText contents
-- drop leading newline if any
@@ -731,7 +731,7 @@ pSpanLike =
parseTag tagName = do
TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True)
let (ids, cs, kvs) = toAttr attrs
- updateIdentifiers (ids, cs, kvs)
+ registerIdentifier ids
content <- mconcat <$> manyTill inline (pCloses tagName <|> eof)
return $ B.spanWith (ids, tagName : cs, kvs) content
@@ -759,13 +759,13 @@ pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag@(TagOpen _ attr') <- pSatisfy $ tagOpenLit "a" (const True)
let title = fromAttrib "title" tag
- let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
+ let attr@(ident,_,_) = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
lab <- mconcat <$> manyTill inline (pCloses "a")
st <- getState
if inFootnotes st && maybeFromAttrib "role" tag == Just "doc-backlink"
then return mempty
else do
- updateIdentifiers attr
+ registerIdentifier ident
-- check for href; if href, then a link, otherwise a span
case maybeFromAttrib "href" tag of
Nothing ->
@@ -781,8 +781,9 @@ pImage = do
url <- canonicalizeUrl $ fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
- let attr = toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr'
- updateIdentifiers attr
+ let attr@(ident,_,_) =
+ toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr'
+ registerIdentifier ident
return $ B.imageWith attr (escapeURI url) title (B.text alt)
pSvg :: PandocMonad m => TagParser m Inlines
@@ -791,7 +792,7 @@ pSvg = do
-- if raw_html enabled, parse svg tag as raw
opent@(TagOpen _ attr') <- pSatisfy (matchTagOpen "svg" [])
let (ident,cls,_) = toAttr attr'
- updateIdentifiers (ident,cls,[])
+ registerIdentifier ident
contents <- many (notFollowedBy (pCloses "svg") >> pAny)
closet <- TagClose "svg" <$ (pCloses "svg" <|> eof)
let rawText = T.strip $ renderTags' (opent : contents ++ [closet])
@@ -812,9 +813,9 @@ pCode = try $ do
code open attr
code :: PandocMonad m => Text -> Attr -> TagParser m Inlines
-code open attr = do
+code open attr@(ident,_,_) = do
result <- mconcat <$> manyTill inline (pCloses open)
- updateIdentifiers attr
+ registerIdentifier ident
return $ formatCode attr result
-- https://developer.mozilla.org/en-US/docs/Web/HTML/Element/bdo
@@ -833,8 +834,8 @@ pSpan :: PandocMonad m => TagParser m Inlines
pSpan = try $ do
guardEnabled Ext_native_spans
TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
- let attr = toAttr attr'
- updateIdentifiers attr
+ let attr@(ident,_,_) = toAttr attr'
+ registerIdentifier ident
contents <- pInTags "span" inline
let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes
where styleAttr = fromMaybe "" $ lookup "style" attr'
@@ -1164,11 +1165,3 @@ canonicalizeUrl url = do
return $ case (parseURIReference (T.unpack url), mbBaseHref) of
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
-
--- | Update list of identifiers in state to prevent auto_identifiers
--- from duplicating existing identifiers.
-updateIdentifiers :: PandocMonad m => Attr -> TagParser m ()
-updateIdentifiers (ident,_,_)
- | T.null ident = return ()
- | otherwise = unless (T.null ident) $
- updateState $ updateIdentifierList $ Set.insert ident