-- | Text IO using the UTF8 character encoding.

module Agda.Utils.IO.UTF8
  ( ReadException
  , readTextFile
  , Agda.Utils.IO.UTF8.readFile
  , Agda.Utils.IO.UTF8.writeFile
  , writeTextToFile
  ) where

import Control.Exception
import Data.Maybe (fromMaybe)
import Data.Text.Lazy (Text)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.Encoding as T
import qualified Data.Text.Lazy.IO as T
import qualified Data.ByteString.Lazy as BS
import qualified System.IO as IO

-- | Converts many character sequences which may be interpreted as
-- line or paragraph separators into '\n'.

convertLineEndings :: Text -> Text
convertLineEndings :: Text -> Text
convertLineEndings = (Char -> Char) -> Text -> Text
T.map Char -> Char
convert (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
convertCRLF
  where
  -- Replaces CR LF with LF.
  convertCRLF :: Text -> Text
convertCRLF = HasCallStack => Text -> Text -> Text -> Text
Text -> Text -> Text -> Text
T.replace Text
"\x000D\n" Text
"\n"

  -- ASCII:
  convert :: Char -> Char
convert Char
'\x000D' = Char
'\n'  -- CR  (Carriage return)
  convert Char
'\x000C' = Char
'\n'  -- FF  (Form feed)
  -- Unicode:
  convert Char
'\x0085' = Char
'\n'  -- NEXT LINE
  convert Char
'\x2028' = Char
'\n'  -- LINE SEPARATOR
  convert Char
'\x2029' = Char
'\n'  -- PARAGRAPH SEPARATOR
  -- Not a line ending (or '\x000A'):
  convert Char
c        = Char
c

-- | Strip the byte order mark (BOM) from a Text.
--
-- - https://github.com/agda/agda/issues/6524
-- - https://github.com/haskell-hvr/cassava/issues/106#issuecomment-373986176
--
stripUtf8Bom :: BS.ByteString -> BS.ByteString
stripUtf8Bom :: ByteString -> ByteString
stripUtf8Bom ByteString
bs = ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
fromMaybe ByteString
bs (ByteString -> ByteString -> Maybe ByteString
BS.stripPrefix ByteString
"\239\187\191" ByteString
bs)

-- | A kind of exception that can be thrown by 'readTextFile' and
-- 'readFile'.
newtype ReadException
  = DecodingError FilePath
    -- ^ Decoding failed for the given file.
  deriving Int -> ReadException -> ShowS
[ReadException] -> ShowS
ReadException -> String
(Int -> ReadException -> ShowS)
-> (ReadException -> String)
-> ([ReadException] -> ShowS)
-> Show ReadException
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> ReadException -> ShowS
showsPrec :: Int -> ReadException -> ShowS
$cshow :: ReadException -> String
show :: ReadException -> String
$cshowList :: [ReadException] -> ShowS
showList :: [ReadException] -> ShowS
Show

instance Exception ReadException where
  displayException :: ReadException -> String
displayException (DecodingError String
file) =
    String
"Failed to read " String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
file String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
".\n" String -> ShowS
forall a. [a] -> [a] -> [a]
++
    String
"Please ensure that this file uses the UTF-8 character encoding."

-- | Reads a UTF8-encoded text file and converts many character
-- sequences which may be interpreted as line or paragraph separators
-- into '\n'.
--
-- If the file cannot be decoded, then a 'ReadException' is raised.

readTextFile :: FilePath -> IO Text
readTextFile :: String -> IO Text
readTextFile String
file = do
  Either UnicodeException Text
s <- ByteString -> Either UnicodeException Text
T.decodeUtf8' (ByteString -> Either UnicodeException Text)
-> (ByteString -> ByteString)
-> ByteString
-> Either UnicodeException Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
stripUtf8Bom (ByteString -> Either UnicodeException Text)
-> IO ByteString -> IO (Either UnicodeException Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String -> IO ByteString
BS.readFile String
file
  case Either UnicodeException Text
s of
    Right Text
s -> Text -> IO Text
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
convertLineEndings Text
s
    Left UnicodeException
_  -> ReadException -> IO Text
forall a e. Exception e => e -> a
throw (ReadException -> IO Text) -> ReadException -> IO Text
forall a b. (a -> b) -> a -> b
$ String -> ReadException
DecodingError String
file

-- | Reads a UTF8-encoded text file and converts many character
-- sequences which may be interpreted as line or paragraph separators
-- into '\n'.
--
-- If the file cannot be decoded, then a 'ReadException' is raised.

readFile :: FilePath -> IO String
readFile :: String -> IO String
readFile String
f = do
  Text
s <- String -> IO Text
readTextFile String
f
  String -> IO String
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (String -> IO String) -> String -> IO String
forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
s

-- | Writes a UTF8-encoded text file. The native convention for line
-- endings is used.

writeFile :: FilePath -> String -> IO ()
writeFile :: String -> String -> IO ()
writeFile String
file String
s = String -> IOMode -> (Handle -> IO ()) -> IO ()
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
IO.withFile String
file IOMode
IO.WriteMode ((Handle -> IO ()) -> IO ()) -> (Handle -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Handle
h -> do
  Handle -> TextEncoding -> IO ()
IO.hSetEncoding Handle
h TextEncoding
IO.utf8
  Handle -> String -> IO ()
IO.hPutStr Handle
h String
s

-- | Writes a UTF8-encoded text file. The native convention for line
-- endings is used.

writeTextToFile :: FilePath -> Text -> IO ()
writeTextToFile :: String -> Text -> IO ()
writeTextToFile String
file Text
s = String -> IOMode -> (Handle -> IO ()) -> IO ()
forall r. String -> IOMode -> (Handle -> IO r) -> IO r
IO.withFile String
file IOMode
IO.WriteMode ((Handle -> IO ()) -> IO ()) -> (Handle -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Handle
h -> do
  Handle -> TextEncoding -> IO ()
IO.hSetEncoding Handle
h TextEncoding
IO.utf8
  Handle -> Text -> IO ()
T.hPutStr Handle
h Text
s