diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2023-06-27 11:12:58 -0700 |
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2023-06-27 11:12:58 -0700 |
| commit | 67970d244e555e249d92f44c28ac3e683129b55d (patch) | |
| tree | 9d87ad4f8b830db10f20f39bf83af61209433a38 /src | |
| parent | 6eaa631185a62fa5c86fd16faf94b98dc6c0a962 (diff) | |
Text.Pandoc.Class: add `toTextM`.
This is like `Text.Pandoc.UTF8.toText`, except:
- it takes a file path as first argument, in addition to
bytestring contents
- it raises an informative error (file path and byte offset) if
the contents are not UTF-8-encoded
[API change]
This replaces `utf8ToText` in `Text.Pandoc.App.Input`.
See #8884.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/App/Input.hs | 25 | ||||
| -rw-r--r-- | src/Text/Pandoc/Class/PandocMonad.hs | 25 |
2 files changed, 28 insertions, 22 deletions
diff --git a/src/Text/Pandoc/App/Input.hs b/src/Text/Pandoc/App/Input.hs index 8ae81495a..4c51e1faf 100644 --- a/src/Text/Pandoc/App/Input.hs +++ b/src/Text/Pandoc/App/Input.hs @@ -18,7 +18,7 @@ import Control.Monad ((>=>)) import Control.Monad.Except (throwError, catchError) import Data.Text (Text) import Network.URI (URI (..), parseURI, unEscapeString) -import Text.Pandoc.Class ( PandocMonad, openURL +import Text.Pandoc.Class ( PandocMonad, openURL, toTextM , readFileStrict, readStdinStrict, report) import Text.Pandoc.Definition (Pandoc (..), Attr, Block (..), Inline (..)) import Text.Pandoc.Error (PandocError (..)) @@ -26,15 +26,13 @@ import Text.Pandoc.Logging (LogMessage (..)) import Text.Pandoc.MIME (getCharset, MimeType) import Text.Pandoc.Options (Extensions, ReaderOptions (..)) import Text.Pandoc.Readers (Reader (..)) -import Text.Pandoc.Shared (tabFilter, textToIdentifier, tshow) +import Text.Pandoc.Shared (tabFilter, textToIdentifier) import Text.Pandoc.URI (uriPathToPath) import Text.Pandoc.Walk (walk) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Lazy as BL import qualified Data.Text as T -import qualified Data.Text.Encoding as TSE -import qualified Data.Text.Encoding.Error as TSE -- | Settings specifying how and which input should be processed. data InputParameters m = InputParameters @@ -97,21 +95,6 @@ readSource src = readFileStrict (uriPathToPath $ T.pack $ uriPath u) _ -> (,Nothing) <$> readFileStrict src -utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text -utf8ToText fp bs = - case TSE.decodeUtf8' . 
dropBOM $ bs of - Left (TSE.DecodeError _ (Just w)) -> - case BS.elemIndex w bs of - Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w - Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w - Left e -> throwError $ PandocAppError (tshow e) - Right t -> return t - where - dropBOM bs' = - if "\xEF\xBB\xBF" `BS.isPrefixOf` bs' - then BS.drop 3 bs' - else bs' - inputToText :: PandocMonad m => (Text -> Text) -> (FilePath, (BS.ByteString, Maybe MimeType)) @@ -119,11 +102,11 @@ inputToText :: PandocMonad m inputToText convTabs (fp, (bs,mt)) = (fp,) . convTabs . T.filter (/='\r') <$> case mt >>= getCharset of - Just "UTF-8" -> utf8ToText fp bs + Just "UTF-8" -> toTextM fp bs Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs Just charset -> throwError $ PandocUnsupportedCharsetError charset Nothing -> catchError - (utf8ToText fp bs) + (toTextM fp bs) (\case PandocUTF8DecodingError{} -> do report $ NotUTF8Encoded diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs index 534ce74a9..6228c296b 100644 --- a/src/Text/Pandoc/Class/PandocMonad.hs +++ b/src/Text/Pandoc/Class/PandocMonad.hs @@ -47,6 +47,7 @@ module Text.Pandoc.Class.PandocMonad , setResourcePath , getResourcePath , readMetadataFile + , toTextM , fillMediaBag , toLang , makeCanonical @@ -74,7 +75,7 @@ import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.MIME (MimeType, getMimeType) import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..)) -import Text.Pandoc.Shared (safeRead, makeCanonical) +import Text.Pandoc.Shared (safeRead, makeCanonical, tshow) import Text.Pandoc.URI (uriPathToPath) import Text.Pandoc.Walk (walkM) import Text.Parsec (ParsecT, getPosition, sourceLine, sourceName) @@ -84,6 +85,8 @@ import qualified Data.Text as T import qualified Debug.Trace import qualified Text.Pandoc.MediaBag as MB import qualified Text.Pandoc.UTF8 as UTF8 +import qualified Data.Text.Encoding as TSE +import qualified 
Data.Text.Encoding.Error as TSE -- | The PandocMonad typeclass contains all the potentially -- IO-related functions used in pandoc's readers and writers. @@ -403,6 +406,26 @@ withPaths (p:ps) action fp = catchError ((p </> fp,) <$> action (p </> fp)) (\_ -> withPaths ps action fp) +-- | A variant of Text.Pandoc.UTF8.toText that takes a FilePath +-- as well as the file's contents as parameter, and traps UTF8 +-- decoding errors so it can issue a more informative PandocUTF8DecodingError +-- with source position. +toTextM :: PandocMonad m => FilePath -> B.ByteString -> m T.Text +toTextM fp bs = + case TSE.decodeUtf8' . dropBOM $ bs of + Left (TSE.DecodeError _ (Just w)) -> + case B.elemIndex w bs of + Just offset -> + throwError $ PandocUTF8DecodingError (T.pack fp) offset w + Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w + Left e -> throwError $ PandocAppError (tshow e) + Right t -> return t + where + dropBOM bs' = + if "\xEF\xBB\xBF" `B.isPrefixOf` bs' + then B.drop 3 bs' + else bs' + -- | Returns @fp@ if the file exists in the current directory; otherwise -- searches for the data file relative to @/subdir/@. Returns @Nothing@ -- if neither file exists. |
