summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2022-10-29 11:18:27 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2022-10-29 11:46:03 -0700
commit512cf5a908872c9b3332718aa3272ad09914da16 (patch)
treeccd7ca935a94ddaf95c2af92fa7a5a6188cd1909 /src
parent82072f588c9f45024b0a13ede0c058e30469b3be (diff)
T.P.Parsing.General: change `characterReference`, `charsInBalanced`.
`characterReference` now returns a Text (as it should, because some named references don't correspond to a single Char), and uses the `lookupEntity` function from commonmark-hs instead of the slow one from tagsoup. `charsInBalanced` now takes a Text parser rather than a Char parser as argument. [API change]
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Parsing/Citations.hs3
-rw-r--r--src/Text/Pandoc/Parsing/General.hs31
-rw-r--r--src/Text/Pandoc/Parsing/Smart.hs9
-rw-r--r--src/Text/Pandoc/Readers/DokuWiki.hs2
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs23
-rw-r--r--src/Text/Pandoc/Readers/Org/Inlines.hs4
-rw-r--r--src/Text/Pandoc/Readers/Roff.hs2
-rw-r--r--src/Text/Pandoc/Readers/TWiki.hs2
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs2
-rw-r--r--src/Text/Pandoc/Readers/TikiWiki.hs2
10 files changed, 37 insertions, 43 deletions
diff --git a/src/Text/Pandoc/Parsing/Citations.hs b/src/Text/Pandoc/Parsing/Citations.hs
index a086951aa..f3c62c03c 100644
--- a/src/Text/Pandoc/Parsing/Citations.hs
+++ b/src/Text/Pandoc/Parsing/Citations.hs
@@ -41,7 +41,8 @@ citeKey allowBraced = try $ do
char '@'
key <- simpleCiteIdentifier
<|> if allowBraced
- then charsInBalanced '{' '}' (satisfy (not . isSpace))
+ then charsInBalanced '{' '}'
+ (T.singleton <$> (satisfy (not . isSpace)))
else mzero
return (suppress_author, key)
diff --git a/src/Text/Pandoc/Parsing/General.hs b/src/Text/Pandoc/Parsing/General.hs
index 397a3e94f..e7f8c16b3 100644
--- a/src/Text/Pandoc/Parsing/General.hs
+++ b/src/Text/Pandoc/Parsing/General.hs
@@ -85,7 +85,7 @@ import Data.List (intercalate, sortOn)
import Data.Ord (Down(..))
import Data.Maybe (fromMaybe)
import Data.Text (Text)
-import Text.HTML.TagSoup.Entity (lookupEntity)
+import Commonmark.Entity (lookupEntity)
import Text.Pandoc.Asciify (toAsciiText)
import Text.Pandoc.Builder (Attr, Inline(Str), Inlines, trimInlines)
import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report)
@@ -449,21 +449,16 @@ lineClump = blanklines
-- pairs of open and close, which must be different. For example,
-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))"
-- and return "hello (there)".
-charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParsecT s st m Char
- -> ParsecT s st m Text
+charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char)
+ => Char -> Char -> ParsecT s st m Text -> ParsecT s st m Text
charsInBalanced open close parser = try $ do
char open
let isDelim c = c == open || c == close
- raw <- many $ T.pack <$> many1 (notFollowedBy (satisfy isDelim) >> parser)
+ raw <- many $ mconcat <$> many1 (notFollowedBy (satisfy isDelim) >> parser)
<|> (do res <- charsInBalanced open close parser
return $ T.singleton open <> res <> T.singleton close)
char close
- return $ T.concat raw
-
--- old charsInBalanced would be:
--- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline)
--- old charsInBalanced' would be:
--- charsInBalanced open close anyChar
+ return $ mconcat raw
-- Parsers for email addresses and URIs
@@ -530,7 +525,7 @@ uri = try $ do
wordChar = satisfy isWordChar
percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit
- entity = try $ pure <$> characterReference
+ entity = try $ T.unpack <$> characterReference
punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>'))
uriChunk = many1 wordChar
<|> percentEscaped
@@ -585,21 +580,17 @@ escaped :: (Stream s m Char, UpdateSourcePos s Char)
escaped parser = try $ char '\\' >> parser
-- | Parse character entity.
-characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Char
+characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Text
characterReference = try $ do
char '&'
- ent <- many1Till nonspaceChar (char ';')
- let ent' = case ent of
- '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug
- '#':_ -> ent
- _ -> ent ++ ";"
- case lookupEntity ent' of
- Just (c : _) -> return c
+ ent <- many1TillChar nonspaceChar (char ';')
+ case lookupEntity (ent <> ";") of
+ Just t -> return t
_ -> Prelude.fail "entity not found"
-- | Parses a character reference and returns a Str element.
charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s st m Inline
-charRef = Str . T.singleton <$> characterReference
+charRef = Str <$> characterReference
lineBlockLine :: Monad m => ParsecT Sources st m Text
lineBlockLine = try $ do
diff --git a/src/Text/Pandoc/Parsing/Smart.hs b/src/Text/Pandoc/Parsing/Smart.hs
index fdc824e2c..e9fe9f7a7 100644
--- a/src/Text/Pandoc/Parsing/Smart.hs
+++ b/src/Text/Pandoc/Parsing/Smart.hs
@@ -43,7 +43,7 @@ import Text.Parsec
, notFollowedBy
, try
)
-
+import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
-- | Parses various ASCII punctuation, quotes, and apostrophe in a smart
@@ -95,9 +95,10 @@ doubleQuoted inlineParser = do
charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParsecT s st m Char
charOrRef cs =
- oneOf cs <|> try (do c <- characterReference
- guard (c `elem` cs)
- return c)
+ oneOf cs <|> try (do t <- characterReference
+ case T.unpack t of
+ [c] | c `elem` cs -> return c
+ _ -> fail "unexpected character reference")
-- | Succeeds if the parser is
--
diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs
index 3d52cd747..1ca4cf696 100644
--- a/src/Text/Pandoc/Readers/DokuWiki.hs
+++ b/src/Text/Pandoc/Readers/DokuWiki.hs
@@ -244,7 +244,7 @@ nocache :: PandocMonad m => DWParser m B.Inlines
nocache = try $ mempty <$ string "~~NOCACHE~~"
str :: PandocMonad m => DWParser m B.Inlines
-str = B.str <$> (many1Char alphaNum <|> countChar 1 characterReference)
+str = B.str <$> (many1Char alphaNum <|> characterReference)
symbol :: PandocMonad m => DWParser m B.Inlines
symbol = B.str <$> countChar 1 nonspaceChar
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index d7517f24c..42df8b985 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -189,11 +189,11 @@ skipNonindentSpaces = do
tabStop <- getOption readerTabStop
gobbleAtMostSpaces (tabStop - 1) <* notFollowedBy spaceChar
-litChar :: PandocMonad m => MarkdownParser m Char
-litChar = escapedChar'
+litChar :: PandocMonad m => MarkdownParser m Text
+litChar = T.singleton <$> escapedChar'
<|> characterReference
- <|> noneOf "\n"
- <|> try (newline >> notFollowedBy blankline >> return ' ')
+ <|> T.singleton <$> noneOf "\n"
+ <|> try (newline >> notFollowedBy blankline >> return " ")
-- | Parse a sequence of inline elements between square brackets,
-- including inlines between balanced pairs of square brackets.
@@ -356,8 +356,9 @@ referenceKey = try $ do
notFollowedBy' $ guardEnabled Ext_mmd_link_attributes >>
try (spnl <* keyValAttr)
notFollowedBy' (() <$ reference)
- many1Char $ notFollowedBy space >> litChar
- let betweenAngles = try $ char '<' >> manyTillChar litChar (char '>')
+ mconcat <$> many1 (notFollowedBy space *> litChar)
+ let betweenAngles = try $ char '<' >>
+ mconcat <$> (manyTill litChar (char '>'))
rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths)
src <- (if rebase then rebasePath pos else id) <$>
(try betweenAngles <|> sourceURL)
@@ -395,7 +396,7 @@ quotedTitle c = try $ do
char c
notFollowedBy spaces
let pEnder = try $ char c >> notFollowedBy (satisfy isAlphaNum)
- let regChunk = many1Char (noneOf ['\\','\n','&',c]) <|> countChar 1 litChar
+ let regChunk = many1Char (noneOf ['\\','\n','&',c]) <|> litChar
let nestedChunk = (\x -> T.singleton c <> x <> T.singleton c) <$> quotedTitle c
T.unwords . T.words . T.concat <$> manyTill (nestedChunk <|> regChunk) pEnder
@@ -653,8 +654,8 @@ keyValAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr)
keyValAttr = try $ do
key <- identifier
char '='
- val <- T.pack <$> enclosed (char '"') (char '"') litChar
- <|> T.pack <$> enclosed (char '\'') (char '\'') litChar
+ val <- mconcat <$> enclosed (char '"') (char '"') litChar
+ <|> mconcat <$> enclosed (char '\'') (char '\'') litChar
<|> ("" <$ try (string "\"\""))
<|> ("" <$ try (string "''"))
<|> manyChar (escapedChar' <|> noneOf " \t\n\r}")
@@ -1797,11 +1798,11 @@ source = do
skipSpaces
let urlChunk =
try parenthesizedChars
- <|> (notFollowedBy (oneOf " )") >> countChar 1 litChar)
+ <|> (notFollowedBy (oneOf " )") >> litChar)
<|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
let betweenAngles = try $
- char '<' >> manyTillChar litChar (char '>')
+ char '<' >> mconcat <$> (manyTill litChar (char '>'))
src <- try betweenAngles <|> sourceURL
tit <- option "" $ try $ spnl >> linkTitle
skipSpaces
diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs
index aeca7575a..9e7f7e6e5 100644
--- a/src/Text/Pandoc/Readers/Org/Inlines.hs
+++ b/src/Text/Pandoc/Readers/Org/Inlines.hs
@@ -796,8 +796,8 @@ notAfterForbiddenBorderChar = do
subOrSuperExpr :: PandocMonad m => OrgParser m (F Inlines)
subOrSuperExpr = try $
simpleSubOrSuperText <|>
- (choice [ charsInBalanced '{' '}' (noneOf "\n\r")
- , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
+ (choice [ charsInBalanced '{' '}' (T.singleton <$> noneOf "\n\r")
+ , enclosing ('(', ')') <$> charsInBalanced '(' ')' (T.singleton <$> noneOf "\n\r")
] >>= parseFromString (mconcat <$> many inline))
where enclosing (left, right) s = T.cons left $ T.snoc s right
diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs
index ccd1509ae..429b90ce1 100644
--- a/src/Text/Pandoc/Readers/Roff.hs
+++ b/src/Text/Pandoc/Readers/Roff.hs
@@ -503,7 +503,7 @@ lexConditional mname = do
expression :: PandocMonad m => RoffLexer m (Maybe Bool)
expression = do
- raw <- charsInBalanced '(' ')' (satisfy (/= '\n'))
+ raw <- charsInBalanced '(' ')' (T.singleton <$> (satisfy (/= '\n')))
<|> many1Char nonspaceChar
returnValue $
case raw of
diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs
index 8bba600a2..0fb54120b 100644
--- a/src/Text/Pandoc/Readers/TWiki.hs
+++ b/src/Text/Pandoc/Readers/TWiki.hs
@@ -447,7 +447,7 @@ autoLink = try $ do
| otherwise = isAlphaNum c
str :: PandocMonad m => TWParser m B.Inlines
-str = B.str <$> (many1Char alphaNum <|> countChar 1 characterReference)
+str = B.str <$> (many1Char alphaNum <|> characterReference)
nop :: PandocMonad m => TWParser m B.Inlines
nop = try $ (void exclamation <|> void nopTag) >> followContent
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index bd96bb403..8922d2b35 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -445,7 +445,7 @@ inlineParsers = [ str
, link
, image
, mark
- , B.str . T.singleton <$> characterReference
+ , B.str <$> characterReference
, smartPunctuation inline
, symbol
]
diff --git a/src/Text/Pandoc/Readers/TikiWiki.hs b/src/Text/Pandoc/Readers/TikiWiki.hs
index f009fad61..b4f8a214e 100644
--- a/src/Text/Pandoc/Readers/TikiWiki.hs
+++ b/src/Text/Pandoc/Readers/TikiWiki.hs
@@ -585,7 +585,7 @@ noparse = try $ do
return $ B.str $ T.pack body
str :: PandocMonad m => TikiWikiParser m B.Inlines
-str = fmap B.str (T.pack <$> many1 alphaNum <|> countChar 1 characterReference)
+str = fmap B.str (T.pack <$> many1 alphaNum <|> characterReference)
symbol :: PandocMonad m => TikiWikiParser m B.Inlines
symbol = fmap B.str (countChar 1 nonspaceChar)