Text.Pandoc.Class: add `toTextM`.

This is like `Text.Pandoc.UTF8.toText`, except:

- it takes a file path as first argument, in addition to bytestring contents
- it raises an informative error with source position if the contents are not UTF8-encoded

[API change]

This replaces `utf8ToText` in `Text.Pandoc.App.Input`.

See #8884.
author: John MacFarlane <jgm@berkeley.edu> 2023-06-27 11:12:58 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2023-06-27 11:12:58 -0700
commit: 67970d244e555e249d92f44c28ac3e683129b55d (patch)
tree: 9d87ad4f8b830db10f20f39bf83af61209433a38 /src
parent: 6eaa631185a62fa5c86fd16faf94b98dc6c0a962 (diff)
2 files changed, 28 insertions, 22 deletions
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs
index 8ae81495a..4c51e1faf 100644
--- a/src/Text/Pandoc/App/Input.hs
+++ b/src/Text/Pandoc/App/Input.hs
@@ -18,7 +18,7 @@ import Control.Monad ((>=>))
 import Control.Monad.Except (throwError, catchError)
 import Data.Text (Text)
 import Network.URI (URI (..), parseURI, unEscapeString)
-import Text.Pandoc.Class ( PandocMonad, openURL
+import Text.Pandoc.Class ( PandocMonad, openURL, toTextM
                          , readFileStrict, readStdinStrict, report)
 import Text.Pandoc.Definition (Pandoc (..), Attr, Block (..), Inline (..))
 import Text.Pandoc.Error (PandocError (..))
@@ -26,15 +26,13 @@ import Text.Pandoc.Logging (LogMessage (..))
 import Text.Pandoc.MIME (getCharset, MimeType)
 import Text.Pandoc.Options (Extensions, ReaderOptions (..))
 import Text.Pandoc.Readers (Reader (..))
-import Text.Pandoc.Shared (tabFilter, textToIdentifier, tshow)
+import Text.Pandoc.Shared (tabFilter, textToIdentifier)
 import Text.Pandoc.URI (uriPathToPath)
 import Text.Pandoc.Walk (walk)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Char8 as B8
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
-import qualified Data.Text.Encoding as TSE
-import qualified Data.Text.Encoding.Error as TSE
 
 -- | Settings specifying how and which input should be processed.
 data InputParameters m = InputParameters
@@ -97,21 +95,6 @@ readSource src =
                  readFileStrict (uriPathToPath $ T.pack $ uriPath u)
     _       -> (,Nothing) <$> readFileStrict src
 
-utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
-utf8ToText fp bs =
-  case TSE.decodeUtf8' . dropBOM $ bs of
-    Left (TSE.DecodeError _ (Just w)) ->
-      case BS.elemIndex w bs of
-        Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
-        Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
-    Left e -> throwError $ PandocAppError (tshow e)
-    Right t -> return t
- where
-   dropBOM bs' =
-     if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
-        then BS.drop 3 bs'
-        else bs'
-
 inputToText :: PandocMonad m
             => (Text -> Text)
             -> (FilePath, (BS.ByteString, Maybe MimeType))
@@ -119,11 +102,11 @@ inputToText :: PandocMonad m
 inputToText convTabs (fp, (bs,mt)) =
   (fp,) . convTabs . T.filter (/='\r') <$>
   case mt >>= getCharset of
-    Just "UTF-8"      -> utf8ToText fp bs
+    Just "UTF-8"      -> toTextM fp bs
     Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
     Just charset      -> throwError $ PandocUnsupportedCharsetError charset
     Nothing           -> catchError
-                           (utf8ToText fp bs)
+                           (toTextM fp bs)
                            (\case
                               PandocUTF8DecodingError{} -> do
                                 report $ NotUTF8Encoded
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 534ce74a9..6228c296b 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Class.PandocMonad
   , setResourcePath
   , getResourcePath
   , readMetadataFile
+  , toTextM
   , fillMediaBag
   , toLang
   , makeCanonical
@@ -74,7 +75,7 @@ import Text.Pandoc.Error
 import Text.Pandoc.Logging
 import Text.Pandoc.MIME (MimeType, getMimeType)
 import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..))
-import Text.Pandoc.Shared (safeRead, makeCanonical)
+import Text.Pandoc.Shared (safeRead, makeCanonical, tshow)
 import Text.Pandoc.URI (uriPathToPath)
 import Text.Pandoc.Walk (walkM)
 import Text.Parsec (ParsecT, getPosition, sourceLine, sourceName)
@@ -84,6 +85,8 @@ import qualified Data.Text as T
 import qualified Debug.Trace
 import qualified Text.Pandoc.MediaBag as MB
 import qualified Text.Pandoc.UTF8 as UTF8
+import qualified Data.Text.Encoding as TSE
+import qualified Data.Text.Encoding.Error as TSE
 
 -- | The PandocMonad typeclass contains all the potentially
 -- IO-related functions used in pandoc's readers and writers.
@@ -403,6 +406,26 @@ withPaths (p:ps) action fp =
   catchError ((p </> fp,) <$> action (p </> fp))
              (\_ -> withPaths ps action fp)
 
+-- | A variant of Text.Pandoc.UTF8.toText that takes a FilePath
+-- as well as the file's contents as parameter, and traps UTF8
+-- decoding errors so it can issue a more informative PandocUTF8DecodingError
+-- with source position.
+toTextM :: PandocMonad m => FilePath -> B.ByteString -> m T.Text
+toTextM fp bs =
+  case TSE.decodeUtf8' . dropBOM $ bs of
+    Left (TSE.DecodeError _ (Just w)) ->
+      case B.elemIndex w bs of
+        Just offset ->
+          throwError $ PandocUTF8DecodingError (T.pack fp) offset w
+        Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
+    Left e -> throwError $ PandocAppError (tshow e)
+    Right t -> return t
+ where
+   dropBOM bs' =
+     if "\xEF\xBB\xBF" `B.isPrefixOf` bs'
+        then B.drop 3 bs'
+        else bs'
+
 -- | Returns @fp@ if the file exists in the current directory; otherwise
 -- searches for the data file relative to @/subdir/@. Returns @Nothing@
 -- if neither file exists.
author	John MacFarlane <jgm@berkeley.edu>	2023-06-27 11:12:58 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2023-06-27 11:12:58 -0700
commit	67970d244e555e249d92f44c28ac3e683129b55d (patch)
tree	9d87ad4f8b830db10f20f39bf83af61209433a38 /src
parent	6eaa631185a62fa5c86fd16faf94b98dc6c0a962 (diff)