diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2022-10-18 10:09:27 -0700 |
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2022-10-18 10:09:27 -0700 |
| commit | 20492d523c8324e36781cfbbc8092c796f94b151 (patch) | |
| tree | 13a31b738ec90f447d0c0c03dbd6296c44d28302 /src/Text | |
| parent | e5fbddd3b6c0c7a3b76b313edbe55242e3b138fc (diff) | |
T.P.Parsing: export `registerIdentifier`.
[API change]
Use this in the HTML reader to register identifiers to avoid
duplicates created by `auto_identifiers`.
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc/Parsing.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/General.hs | 15 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 39 |
3 files changed, 31 insertions, 25 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 501e8c74b..85918ff11 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -96,6 +96,7 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources, Key (..), toKey, registerHeader, + registerIdentifier, smartPunctuation, singleQuoteStart, singleQuoteEnd, @@ -288,6 +289,7 @@ import Text.Pandoc.Parsing.General parseFromString', readWith, registerHeader, + registerIdentifier, sepBy1', skipSpaces, spaceChar, diff --git a/src/Text/Pandoc/Parsing/General.hs b/src/Text/Pandoc/Parsing/General.hs index e425c7408..ae892addb 100644 --- a/src/Text/Pandoc/Parsing/General.hs +++ b/src/Text/Pandoc/Parsing/General.hs @@ -47,6 +47,7 @@ module Text.Pandoc.Parsing.General , readWith , readWithM , registerHeader + , registerIdentifier , sepBy1' , skipSpaces , spaceChar @@ -670,8 +671,8 @@ registerHeader (ident,classes,kvs) header' = do let id'' = if Ext_ascii_identifiers `extensionEnabled` exts then toAsciiText id' else id' - updateState $ updateIdentifierList $ Set.insert id' - updateState $ updateIdentifierList $ Set.insert id'' + registerIdentifier id' + when (id'' /= id') $ registerIdentifier id'' return (id'',classes,kvs) else do unless (T.null ident) $ do @@ -681,6 +682,16 @@ registerHeader (ident,classes,kvs) header' = do updateState $ updateIdentifierList $ Set.insert ident return (ident,classes,kvs) +-- | Update list of identifiers in state to prevent auto_identifiers +-- from duplicating existing identifiers. +registerIdentifier :: (Stream s m a, HasIdentifierList st) + => Text -> ParsecT s st m () +registerIdentifier ident + | T.null ident = return () + | otherwise = unless (T.null ident) $ + updateState $ updateIdentifierList $ Set.insert ident + + -- This is used to prevent exponential blowups for things like: -- a**a*a**a*a**a*a**a*a**a*a**a*a** nested :: Stream s m a diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f44859ef8..d56d846bb 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -292,8 +292,8 @@ eFootnotes = try $ do -- but there might be content other than notes, in which case -- we want a div: else do - let attr'' =toAttr attr' - updateIdentifiers attr'' + let attr''@(ident,_,_) = toAttr attr' + registerIdentifier ident return $ B.divWith attr'' result eNoteref :: PandocMonad m => TagParser m Inlines @@ -459,7 +459,7 @@ pDiv = try $ do guardEnabled Ext_native_divs TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) let (ident, classes, kvs) = toAttr attr' - updateIdentifiers (ident, classes, kvs) + registerIdentifier ident contents <- pInTags tag block let classes' = if tag == "section" then "section":classes @@ -608,7 +608,7 @@ pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) -- if the `pre` has no attributes, try if it is followed by a `code` -- element and use those attributes if possible. - attr <- case attr' of + attr@(ident,_,_) <- case attr' of _:_ -> pure (toAttr attr') [] -> option nullAttr $ do TagOpen _ codeAttr <- pSatisfy (matchTagOpen "code" []) @@ -618,7 +618,7 @@ pCodeBlock = try $ do , let v' = if k == "class" then fromMaybe v (T.stripPrefix "language-" v) else v ] - updateIdentifiers attr + registerIdentifier ident contents <- manyTill pAny (pCloses "pre" <|> eof) let rawText = T.concat $ map tagToText contents -- drop leading newline if any @@ -731,7 +731,7 @@ pSpanLike = parseTag tagName = do TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True) let (ids, cs, kvs) = toAttr attrs - updateIdentifiers (ids, cs, kvs) + registerIdentifier ids content <- mconcat <$> manyTill inline (pCloses tagName <|> eof) return $ B.spanWith (ids, tagName : cs, kvs) content @@ -759,13 +759,13 @@ pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag@(TagOpen _ attr') <- pSatisfy $ tagOpenLit "a" (const True) let title = fromAttrib "title" tag - let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr' + let attr@(ident,_,_) = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr' lab <- mconcat <$> manyTill inline (pCloses "a") st <- getState if inFootnotes st && maybeFromAttrib "role" tag == Just "doc-backlink" then return mempty else do - updateIdentifiers attr + registerIdentifier ident -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of Nothing -> @@ -781,8 +781,9 @@ pImage = do url <- canonicalizeUrl $ fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - let attr = toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr' - updateIdentifiers attr + let attr@(ident,_,_) = + toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr' + registerIdentifier ident return $ B.imageWith attr (escapeURI url) title (B.text alt) pSvg :: PandocMonad m => TagParser m Inlines @@ -791,7 +792,7 @@ pSvg = do -- if raw_html enabled, parse svg tag as raw opent@(TagOpen _ attr') <- pSatisfy (matchTagOpen "svg" []) let (ident,cls,_) = toAttr attr' - updateIdentifiers (ident,cls,[]) + registerIdentifier ident contents <- many (notFollowedBy (pCloses "svg") >> pAny) closet <- TagClose "svg" <$ (pCloses "svg" <|> eof) let rawText = T.strip $ renderTags' (opent : contents ++ [closet]) @@ -812,9 +813,9 @@ pCode = try $ do code open attr code :: PandocMonad m => Text -> Attr -> TagParser m Inlines -code open attr = do +code open attr@(ident,_,_) = do result <- mconcat <$> manyTill inline (pCloses open) - updateIdentifiers attr + registerIdentifier ident return $ formatCode attr result -- https://developer.mozilla.org/en-US/docs/Web/HTML/Element/bdo @@ -833,8 +834,8 @@ pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do guardEnabled Ext_native_spans TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) - let attr = toAttr attr' - updateIdentifiers attr + let attr@(ident,_,_) = toAttr attr' + registerIdentifier ident contents <- pInTags "span" inline let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes where styleAttr = fromMaybe "" $ lookup "style" attr' @@ -1164,11 +1165,3 @@ canonicalizeUrl url = do return $ case (parseURIReference (T.unpack url), mbBaseHref) of (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url - --- | Update list of identifiers in state to prevent auto_identifiers --- from duplicating existing identifiers. -updateIdentifiers :: PandocMonad m => Attr -> TagParser m () -updateIdentifiers (ident,_,_) - | T.null ident = return () - | otherwise = unless (T.null ident) $ - updateState $ updateIdentifierList $ Set.insert ident |
