summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2023-06-27 11:12:58 -0700
committer John MacFarlane <jgm@berkeley.edu> 2023-06-27 11:12:58 -0700
commit 67970d244e555e249d92f44c28ac3e683129b55d (patch)
tree 9d87ad4f8b830db10f20f39bf83af61209433a38 /src
parent 6eaa631185a62fa5c86fd16faf94b98dc6c0a962 (diff)
Text.Pandoc.Class: add `toTextM`.
This is like `Text.Pandoc.UTF8.toText`, except that (1) it takes a file path as its first argument, in addition to the bytestring contents, and (2) it raises an informative error with source position if the contents are not UTF-8 encoded. [API change] This replaces `utf8ToText` in `Text.Pandoc.App.Input`. See #8884.
Diffstat (limited to 'src')
-rw-r--r-- src/Text/Pandoc/App/Input.hs | 25
-rw-r--r-- src/Text/Pandoc/Class/PandocMonad.hs | 25
2 files changed, 28 insertions, 22 deletions
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs
index 8ae81495a..4c51e1faf 100644
--- a/src/Text/Pandoc/App/Input.hs
+++ b/src/Text/Pandoc/App/Input.hs
@@ -18,7 +18,7 @@ import Control.Monad ((>=>))
import Control.Monad.Except (throwError, catchError)
import Data.Text (Text)
import Network.URI (URI (..), parseURI, unEscapeString)
-import Text.Pandoc.Class ( PandocMonad, openURL
+import Text.Pandoc.Class ( PandocMonad, openURL, toTextM
, readFileStrict, readStdinStrict, report)
import Text.Pandoc.Definition (Pandoc (..), Attr, Block (..), Inline (..))
import Text.Pandoc.Error (PandocError (..))
@@ -26,15 +26,13 @@ import Text.Pandoc.Logging (LogMessage (..))
import Text.Pandoc.MIME (getCharset, MimeType)
import Text.Pandoc.Options (Extensions, ReaderOptions (..))
import Text.Pandoc.Readers (Reader (..))
-import Text.Pandoc.Shared (tabFilter, textToIdentifier, tshow)
+import Text.Pandoc.Shared (tabFilter, textToIdentifier)
import Text.Pandoc.URI (uriPathToPath)
import Text.Pandoc.Walk (walk)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
-import qualified Data.Text.Encoding as TSE
-import qualified Data.Text.Encoding.Error as TSE
-- | Settings specifying how and which input should be processed.
data InputParameters m = InputParameters
@@ -97,21 +95,6 @@ readSource src =
readFileStrict (uriPathToPath $ T.pack $ uriPath u)
_ -> (,Nothing) <$> readFileStrict src
-utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
-utf8ToText fp bs =
- case TSE.decodeUtf8' . dropBOM $ bs of
- Left (TSE.DecodeError _ (Just w)) ->
- case BS.elemIndex w bs of
- Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
- Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
- Left e -> throwError $ PandocAppError (tshow e)
- Right t -> return t
- where
- dropBOM bs' =
- if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
- then BS.drop 3 bs'
- else bs'
-
inputToText :: PandocMonad m
=> (Text -> Text)
-> (FilePath, (BS.ByteString, Maybe MimeType))
@@ -119,11 +102,11 @@ inputToText :: PandocMonad m
inputToText convTabs (fp, (bs,mt)) =
(fp,) . convTabs . T.filter (/='\r') <$>
case mt >>= getCharset of
- Just "UTF-8" -> utf8ToText fp bs
+ Just "UTF-8" -> toTextM fp bs
Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
Just charset -> throwError $ PandocUnsupportedCharsetError charset
Nothing -> catchError
- (utf8ToText fp bs)
+ (toTextM fp bs)
(\case
PandocUTF8DecodingError{} -> do
report $ NotUTF8Encoded
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 534ce74a9..6228c296b 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Class.PandocMonad
, setResourcePath
, getResourcePath
, readMetadataFile
+ , toTextM
, fillMediaBag
, toLang
, makeCanonical
@@ -74,7 +75,7 @@ import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.MIME (MimeType, getMimeType)
import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..))
-import Text.Pandoc.Shared (safeRead, makeCanonical)
+import Text.Pandoc.Shared (safeRead, makeCanonical, tshow)
import Text.Pandoc.URI (uriPathToPath)
import Text.Pandoc.Walk (walkM)
import Text.Parsec (ParsecT, getPosition, sourceLine, sourceName)
@@ -84,6 +85,8 @@ import qualified Data.Text as T
import qualified Debug.Trace
import qualified Text.Pandoc.MediaBag as MB
import qualified Text.Pandoc.UTF8 as UTF8
+import qualified Data.Text.Encoding as TSE
+import qualified Data.Text.Encoding.Error as TSE
-- | The PandocMonad typeclass contains all the potentially
-- IO-related functions used in pandoc's readers and writers.
@@ -403,6 +406,26 @@ withPaths (p:ps) action fp =
catchError ((p </> fp,) <$> action (p </> fp))
(\_ -> withPaths ps action fp)
+-- | A variant of 'Text.Pandoc.UTF8.toText' that takes a 'FilePath'
+-- as well as the file's contents as parameters, and traps UTF-8
+-- decoding errors so it can raise a more informative 'PandocUTF8DecodingError'
+-- carrying the file path, a byte offset, and the offending byte.
+toTextM :: PandocMonad m => FilePath -> B.ByteString -> m T.Text
+toTextM fp bs =
+ case TSE.decodeUtf8' . dropBOM $ bs of -- decode the input after dropBOM has removed any leading BOM
+ Left (TSE.DecodeError _ (Just w)) -> -- the decoder identified an offending byte w
+ case B.elemIndex w bs of -- NOTE(review): searches the original (BOM included) input; presumably returns the first occurrence of w, which may precede the real failure point - confirm
+ Just offset ->
+ throwError $ PandocUTF8DecodingError (T.pack fp) offset w
+ Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w -- w not found in bs: fall back to offset 0
+ Left e -> throwError $ PandocAppError (tshow e) -- any other decoding failure is reported via its shown form
+ Right t -> return t -- decoding succeeded; t is the text with any leading BOM already removed
+ where
+ dropBOM bs' = -- strips a leading three-byte EF BB BF prefix, if present
+ if "\xEF\xBB\xBF" `B.isPrefixOf` bs'
+ then B.drop 3 bs'
+ else bs'
+
-- | Returns @fp@ if the file exists in the current directory; otherwise
-- searches for the data file relative to @/subdir/@. Returns @Nothing@
-- if neither file exists.