diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2022-10-29 11:18:27 -0700 |
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2022-10-29 11:46:03 -0700 |
| commit | 512cf5a908872c9b3332718aa3272ad09914da16 (patch) | |
| tree | ccd7ca935a94ddaf95c2af92fa7a5a6188cd1909 /src | |
| parent | 82072f588c9f45024b0a13ede0c058e30469b3be (diff) | |
T.P.Parsing.General: change `characterReference`, `charsInBalanced`.
`characterReference` now returns a Text (as it should, because some
named references don't correspond to a single Char), and uses
the `lookupEntity` function from commonmark-hs instead of the slow
one from tagsoup.
`charsInBalanced` now takes a Text parser rather than a Char parser
as argument.
[API change]
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Parsing/Citations.hs | 3 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/General.hs | 31 | ||||
| -rw-r--r-- | src/Text/Pandoc/Parsing/Smart.hs | 9 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/DokuWiki.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 23 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Org/Inlines.hs | 4 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Roff.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/TWiki.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/TikiWiki.hs | 2 |
10 files changed, 37 insertions, 43 deletions
diff --git a/src/Text/Pandoc/Parsing/Citations.hs b/src/Text/Pandoc/Parsing/Citations.hs index a086951aa..f3c62c03c 100644 --- a/src/Text/Pandoc/Parsing/Citations.hs +++ b/src/Text/Pandoc/Parsing/Citations.hs @@ -41,7 +41,8 @@ citeKey allowBraced = try $ do char '@' key <- simpleCiteIdentifier <|> if allowBraced - then charsInBalanced '{' '}' (satisfy (not . isSpace)) + then charsInBalanced '{' '}' + (T.singleton <$> (satisfy (not . isSpace))) else mzero return (suppress_author, key) diff --git a/src/Text/Pandoc/Parsing/General.hs b/src/Text/Pandoc/Parsing/General.hs index 397a3e94f..e7f8c16b3 100644 --- a/src/Text/Pandoc/Parsing/General.hs +++ b/src/Text/Pandoc/Parsing/General.hs @@ -85,7 +85,7 @@ import Data.List (intercalate, sortOn) import Data.Ord (Down(..)) import Data.Maybe (fromMaybe) import Data.Text (Text) -import Text.HTML.TagSoup.Entity (lookupEntity) +import Commonmark.Entity (lookupEntity) import Text.Pandoc.Asciify (toAsciiText) import Text.Pandoc.Builder (Attr, Inline(Str), Inlines, trimInlines) import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report) @@ -449,21 +449,16 @@ lineClump = blanklines -- pairs of open and close, which must be different. For example, -- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" -- and return "hello (there)". -charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParsecT s st m Char - -> ParsecT s st m Text +charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) + => Char -> Char -> ParsecT s st m Text -> ParsecT s st m Text charsInBalanced open close parser = try $ do char open let isDelim c = c == open || c == close - raw <- many $ T.pack <$> many1 (notFollowedBy (satisfy isDelim) >> parser) + raw <- many $ mconcat <$> many1 (notFollowedBy (satisfy isDelim) >> parser) <|> (do res <- charsInBalanced open close parser return $ T.singleton open <> res <> T.singleton close) char close - return $ T.concat raw - --- old charsInBalanced would be: --- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline) --- old charsInBalanced' would be: --- charsInBalanced open close anyChar + return $ mconcat raw -- Parsers for email addresses and URIs @@ -530,7 +525,7 @@ uri = try $ do wordChar = satisfy isWordChar percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit - entity = try $ pure <$> characterReference + entity = try $ T.unpack <$> characterReference punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) uriChunk = many1 wordChar <|> percentEscaped @@ -585,21 +580,17 @@ escaped :: (Stream s m Char, UpdateSourcePos s Char) escaped parser = try $ char '\\' >> parser -- | Parse character entity. -characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Char +characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Text characterReference = try $ do char '&' - ent <- many1Till nonspaceChar (char ';') - let ent' = case ent of - '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug - '#':_ -> ent - _ -> ent ++ ";" - case lookupEntity ent' of - Just (c : _) -> return c + ent <- many1TillChar nonspaceChar (char ';') + case lookupEntity (ent <> ";") of + Just t -> return t _ -> Prelude.fail "entity not found" -- | Parses a character reference and returns a Str element. charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Inline -charRef = Str . T.singleton <$> characterReference +charRef = Str <$> characterReference lineBlockLine :: Monad m => ParsecT Sources st m Text lineBlockLine = try $ do diff --git a/src/Text/Pandoc/Parsing/Smart.hs b/src/Text/Pandoc/Parsing/Smart.hs index fdc824e2c..e9fe9f7a7 100644 --- a/src/Text/Pandoc/Parsing/Smart.hs +++ b/src/Text/Pandoc/Parsing/Smart.hs @@ -43,7 +43,7 @@ import Text.Parsec , notFollowedBy , try ) - +import qualified Data.Text as T import qualified Text.Pandoc.Builder as B -- | Parses various ASCII punctuation, quotes, and apostrophe in a smart @@ -95,9 +95,10 @@ doubleQuoted inlineParser = do charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParsecT s st m Char charOrRef cs = - oneOf cs <|> try (do c <- characterReference - guard (c `elem` cs) - return c) + oneOf cs <|> try (do t <- characterReference + case T.unpack t of + [c] | c `elem` cs -> return c + _ -> fail "unexpected character reference") -- | Succeeds if the parser is -- diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs index 3d52cd747..1ca4cf696 100644 --- a/src/Text/Pandoc/Readers/DokuWiki.hs +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -244,7 +244,7 @@ nocache :: PandocMonad m => DWParser m B.Inlines nocache = try $ mempty <$ string "~~NOCACHE~~" str :: PandocMonad m => DWParser m B.Inlines -str = B.str <$> (many1Char alphaNum <|> countChar 1 characterReference) +str = B.str <$> (many1Char alphaNum <|> characterReference) symbol :: PandocMonad m => DWParser m B.Inlines symbol = B.str <$> countChar 1 nonspaceChar diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index d7517f24c..42df8b985 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -189,11 +189,11 @@ skipNonindentSpaces = do tabStop <- getOption readerTabStop gobbleAtMostSpaces (tabStop - 1) <* notFollowedBy spaceChar -litChar :: PandocMonad m => MarkdownParser m Char -litChar = escapedChar' +litChar :: PandocMonad m => MarkdownParser m Text +litChar = T.singleton <$> escapedChar' <|> characterReference - <|> noneOf "\n" - <|> try (newline >> notFollowedBy blankline >> return ' ') + <|> T.singleton <$> noneOf "\n" + <|> try (newline >> notFollowedBy blankline >> return " ") -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. @@ -356,8 +356,9 @@ referenceKey = try $ do notFollowedBy' $ guardEnabled Ext_mmd_link_attributes >> try (spnl <* keyValAttr) notFollowedBy' (() <$ reference) - many1Char $ notFollowedBy space >> litChar - let betweenAngles = try $ char '<' >> manyTillChar litChar (char '>') + mconcat <$> many1 (notFollowedBy space *> litChar) + let betweenAngles = try $ char '<' >> + mconcat <$> (manyTill litChar (char '>')) rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths) src <- (if rebase then rebasePath pos else id) <$> (try betweenAngles <|> sourceURL) @@ -395,7 +396,7 @@ quotedTitle c = try $ do char c notFollowedBy spaces let pEnder = try $ char c >> notFollowedBy (satisfy isAlphaNum) - let regChunk = many1Char (noneOf ['\\','\n','&',c]) <|> countChar 1 litChar + let regChunk = many1Char (noneOf ['\\','\n','&',c]) <|> litChar let nestedChunk = (\x -> T.singleton c <> x <> T.singleton c) <$> quotedTitle c T.unwords . T.words . T.concat <$> manyTill (nestedChunk <|> regChunk) pEnder @@ -653,8 +654,8 @@ keyValAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) keyValAttr = try $ do key <- identifier char '=' - val <- T.pack <$> enclosed (char '"') (char '"') litChar - <|> T.pack <$> enclosed (char '\'') (char '\'') litChar + val <- mconcat <$> enclosed (char '"') (char '"') litChar + <|> mconcat <$> enclosed (char '\'') (char '\'') litChar <|> ("" <$ try (string "\"\"")) <|> ("" <$ try (string "''")) <|> manyChar (escapedChar' <|> noneOf " \t\n\r}") @@ -1797,11 +1798,11 @@ source = do skipSpaces let urlChunk = try parenthesizedChars - <|> (notFollowedBy (oneOf " )") >> countChar 1 litChar) + <|> (notFollowedBy (oneOf " )") >> litChar) <|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')")) let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk let betweenAngles = try $ - char '<' >> manyTillChar litChar (char '>') + char '<' >> mconcat <$> (manyTill litChar (char '>')) src <- try betweenAngles <|> sourceURL tit <- option "" $ try $ spnl >> linkTitle skipSpaces diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index aeca7575a..9e7f7e6e5 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -796,8 +796,8 @@ notAfterForbiddenBorderChar = do subOrSuperExpr :: PandocMonad m => OrgParser m (F Inlines) subOrSuperExpr = try $ simpleSubOrSuperText <|> - (choice [ charsInBalanced '{' '}' (noneOf "\n\r") - , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + (choice [ charsInBalanced '{' '}' (T.singleton <$> noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (T.singleton <$> noneOf "\n\r") ] >>= parseFromString (mconcat <$> many inline)) where enclosing (left, right) s = T.cons left $ T.snoc s right diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs index ccd1509ae..429b90ce1 100644 --- a/src/Text/Pandoc/Readers/Roff.hs +++ b/src/Text/Pandoc/Readers/Roff.hs @@ -503,7 +503,7 @@ lexConditional mname = do expression :: PandocMonad m => RoffLexer m (Maybe Bool) expression = do - raw <- charsInBalanced '(' ')' (satisfy (/= '\n')) + raw <- charsInBalanced '(' ')' (T.singleton <$> (satisfy (/= '\n'))) <|> many1Char nonspaceChar returnValue $ case raw of diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs index 8bba600a2..0fb54120b 100644 --- a/src/Text/Pandoc/Readers/TWiki.hs +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -447,7 +447,7 @@ autoLink = try $ do | otherwise = isAlphaNum c str :: PandocMonad m => TWParser m B.Inlines -str = B.str <$> (many1Char alphaNum <|> countChar 1 characterReference) +str = B.str <$> (many1Char alphaNum <|> characterReference) nop :: PandocMonad m => TWParser m B.Inlines nop = try $ (void exclamation <|> void nopTag) >> followContent diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index bd96bb403..8922d2b35 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -445,7 +445,7 @@ inlineParsers = [ str , link , image , mark - , B.str . T.singleton <$> characterReference + , B.str <$> characterReference , smartPunctuation inline , symbol ] diff --git a/src/Text/Pandoc/Readers/TikiWiki.hs b/src/Text/Pandoc/Readers/TikiWiki.hs index f009fad61..b4f8a214e 100644 --- a/src/Text/Pandoc/Readers/TikiWiki.hs +++ b/src/Text/Pandoc/Readers/TikiWiki.hs @@ -585,7 +585,7 @@ noparse = try $ do return $ B.str $ T.pack body str :: PandocMonad m => TikiWikiParser m B.Inlines -str = fmap B.str (T.pack <$> many1 alphaNum <|> countChar 1 characterReference) +str = fmap B.str (T.pack <$> many1 alphaNum <|> characterReference) symbol :: PandocMonad m => TikiWikiParser m B.Inlines symbol = fmap B.str (countChar 1 nonspaceChar) |
