summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-06-21 22:35:07 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2021-06-21 22:35:07 -0700
commit0352f7845bfa2053797850c3639414978285b63e (patch)
treee25772594c18c114debc2ebf3d94160855498192
parent2ef2049b4e94dc51961e75edb27af1d2f83acd3b (diff)
Improve emailAddress in Text.Pandoc.Parsing.
Previously the parser would accept characters in domains that are illegal in domains, and this sometimes caused it to gobble bits of the following text. Closes #7398. Note that this change, by itself, caused some txt2tag reader tests to fail. txt2tags allows bare email addresses with a following form query. So, in addition to the change to emailAddress, we modify the txt2tags parser so it can still handle these cases.
-rw-r--r--src/Text/Pandoc/Parsing.hs7
-rw-r--r--src/Text/Pandoc/Readers/Txt2Tags.hs22
2 files changed, 24 insertions, 5 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 0bb794ba1..082d9565b 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -693,13 +693,12 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
mailbox = intercalate "." <$> (emailWord `sepBy1'` dot)
domain = intercalate "." <$> (subdomain `sepBy1'` dot)
dot = char '.'
- subdomain = many1 $ alphaNum <|> innerPunct
+ subdomain = many1 $ alphaNum <|> innerPunct (=='-')
-- this excludes some valid email addresses, since an
-- email could contain e.g. '__', but gives better results
-- for our purposes, when combined with markdown parsing:
- innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@')
- <* notFollowedBy space
- <* notFollowedBy (satisfy isPunctuation))
+ innerPunct f = try (satisfy f
+ <* notFollowedBy (satisfy (not . isAlphaNum)))
-- technically an email address could begin with a symbol,
-- but allowing this creates too many problems.
-- See e.g. https://github.com/jgm/pandoc/issues/2940
diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs
index 6f92f0063..b5cf5a0f3 100644
--- a/src/Text/Pandoc/Readers/Txt2Tags.hs
+++ b/src/Text/Pandoc/Readers/Txt2Tags.hs
@@ -478,9 +478,29 @@ macro = try $ do
-- raw URLs in text are automatically linked
url :: T2T Inlines
url = try $ do
- (rawUrl, escapedUrl) <- try uri <|> emailAddress
+ (rawUrl, escapedUrl) <- try uri <|> emailAddress'
return $ B.link rawUrl "" (B.str escapedUrl)
+emailAddress' :: T2T (Text, Text)
+emailAddress' = do
+ (base, mailURI) <- emailAddress
+ query <- option "" emailQuery
+ return (base <> query, mailURI <> query)
+
+emailQuery :: T2T Text
+emailQuery = do
+ char '?'
+ parts <- kv `sepBy1` (char '&')
+ return $ "?" <> T.intercalate "&" parts
+
+kv :: T2T Text
+kv = do
+ k <- T.pack <$> many1 alphaNum
+ char '='
+ let vchar = alphaNum <|> try (oneOf "%._/~:,=$@&+-;*" <* lookAhead alphaNum)
+ v <- T.pack <$> many1 vchar
+ return (k <> "=" <> v)
+
uri :: T2T (Text, Text)
uri = try $ do
address <- t2tURI