{- GHC File system encoding handling.
 -
 - Copyright 2012-2016 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}

{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-tabs #-}

module Utility.FileSystemEncoding (
	useFileSystemEncoding,
	fileEncoding,
	withFilePath,
	RawFilePath,
	fromRawFilePath,
	toRawFilePath,
	decodeBS,
	encodeBS,
	decodeW8,
	encodeW8,
	encodeW8NUL,
	decodeW8NUL,
	truncateFilePath,
	s2w8,
	w82s,
	c2w8,
	w82c,
) where

import qualified GHC.Foreign as GHC
import qualified GHC.IO.Encoding as Encoding
import Foreign.C
import System.IO
import System.IO.Unsafe
import Data.Word
import Data.List
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.Lazy.UTF8 as L8
#endif

import Utility.Exception
import Utility.Split

{- Makes all subsequent Handles that are opened, as well as stdio Handles,
 - use the filesystem encoding, instead of the encoding of the current
 - locale.
 -
 - The filesystem encoding allows "arbitrary undecodable bytes to be
 - round-tripped through it". This avoids encoded failures when data is not
 - encoded matching the current locale.
 -
 - Note that code can still use hSetEncoding to change the encoding of a
 - Handle. This only affects the default encoding.
 -}
useFileSystemEncoding :: IO ()
useFileSystemEncoding :: IO ()
useFileSystemEncoding = do
#ifndef mingw32_HOST_OS
	TextEncoding
e <- IO TextEncoding
Encoding.getFileSystemEncoding
#else
	{- The file system encoding does not work well on Windows,
	 - and Windows only has utf FilePaths anyway. -}
	let e = Encoding.utf8
#endif
	Handle -> TextEncoding -> IO ()
hSetEncoding Handle
stdin TextEncoding
e
	Handle -> TextEncoding -> IO ()
hSetEncoding Handle
stdout TextEncoding
e
	Handle -> TextEncoding -> IO ()
hSetEncoding Handle
stderr TextEncoding
e
	TextEncoding -> IO ()
Encoding.setLocaleEncoding TextEncoding
e	

fileEncoding :: Handle -> IO ()
#ifndef mingw32_HOST_OS
fileEncoding :: Handle -> IO ()
fileEncoding Handle
h = Handle -> TextEncoding -> IO ()
hSetEncoding Handle
h forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< IO TextEncoding
Encoding.getFileSystemEncoding
#else
fileEncoding h = hSetEncoding h Encoding.utf8
#endif

{- Marshal a Haskell FilePath into a NUL terminated C string using temporary
 - storage. The FilePath is encoded using the filesystem encoding,
 - reversing the decoding that should have been done when the FilePath
 - was obtained. -}
withFilePath :: FilePath -> (CString -> IO a) -> IO a
withFilePath :: forall a. FilePath -> (CString -> IO a) -> IO a
withFilePath FilePath
fp CString -> IO a
f = IO TextEncoding
Encoding.getFileSystemEncoding
	forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \TextEncoding
enc -> forall a. TextEncoding -> FilePath -> (CString -> IO a) -> IO a
GHC.withCString TextEncoding
enc FilePath
fp CString -> IO a
f

{- Encodes a FilePath into a String, applying the filesystem encoding.
 -
 - There are very few things it makes sense to do with such an encoded
 - string. It's not a legal filename; it should not be displayed.
 - So this function is not exported, but instead used by the few functions
 - that can usefully consume it.
 -
 - This use of unsafePerformIO is belived to be safe; GHC's interface
 - only allows doing this conversion with CStrings, and the CString buffer
 - is allocated, used, and deallocated within the call, with no side
 - effects.
 -
 - If the FilePath contains a value that is not legal in the filesystem
 - encoding, rather than thowing an exception, it will be returned as-is.
 -}
{-# NOINLINE _encodeFilePath #-}
_encodeFilePath :: FilePath -> String
_encodeFilePath :: FilePath -> FilePath
_encodeFilePath FilePath
fp = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
	TextEncoding
enc <- IO TextEncoding
Encoding.getFileSystemEncoding
	forall a. TextEncoding -> FilePath -> (CString -> IO a) -> IO a
GHC.withCString TextEncoding
enc FilePath
fp (TextEncoding -> CString -> IO FilePath
GHC.peekCString TextEncoding
Encoding.char8)
		forall (m :: * -> *) a.
MonadCatch m =>
m a -> (SomeException -> m a) -> m a
`catchNonAsync` (\SomeException
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return FilePath
fp)

{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBS :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS
decodeBS :: ByteString -> FilePath
decodeBS = [Word8] -> FilePath
encodeW8NUL forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
L.unpack
#else
{- On Windows, we assume that the ByteString is utf-8, since Windows
 - only uses unicode for filenames. -}
decodeBS = L8.toString
#endif

{- Encodes a FilePath into a ByteString, applying the filesystem encoding. -}
encodeBS :: FilePath -> L.ByteString
#ifndef mingw32_HOST_OS
encodeBS :: FilePath -> ByteString
encodeBS = [Word8] -> ByteString
L.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. FilePath -> [Word8]
decodeW8NUL
#else
encodeBS = L8.fromString
#endif

{- Recent versions of the unix package have this alias; defined here
 - for backwards compatibility. -}
type RawFilePath = S.ByteString

{- Note that the RawFilePath is assumed to never contain NUL,
 - since filename's don't. This should only be used with actual
 - RawFilePaths not arbitrary ByteString that may contain NUL. -}
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = [Word8] -> FilePath
encodeW8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. RawFilePath -> [Word8]
S.unpack

{- Note that the FilePath is assumed to never contain NUL,
 - since filename's don't. This should only be used with actual FilePaths
 - not arbitrary String that may contain NUL. -}
toRawFilePath :: FilePath -> RawFilePath
toRawFilePath :: FilePath -> RawFilePath
toRawFilePath = [Word8] -> RawFilePath
S.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. FilePath -> [Word8]
decodeW8

{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
 -
 - w82c produces a String, which may contain Chars that are invalid
 - unicode. From there, this is really a simple matter of applying the
 - file system encoding, only complicated by GHC's interface to doing so.
 -
 - Note that the encoding stops at any NUL in the input. FilePaths
 - do not normally contain embedded NUL, but Haskell Strings may.
 -}
{-# NOINLINE encodeW8 #-}
encodeW8 :: [Word8] -> FilePath
encodeW8 :: [Word8] -> FilePath
encodeW8 [Word8]
w8 = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
	TextEncoding
enc <- IO TextEncoding
Encoding.getFileSystemEncoding
	forall a. TextEncoding -> FilePath -> (CString -> IO a) -> IO a
GHC.withCString TextEncoding
Encoding.char8 ([Word8] -> FilePath
w82s [Word8]
w8) forall a b. (a -> b) -> a -> b
$ TextEncoding -> CString -> IO FilePath
GHC.peekCString TextEncoding
enc

{- Useful when you want the actual number of bytes that will be used to
 - represent the FilePath on disk. -}
decodeW8 :: FilePath -> [Word8]
decodeW8 :: FilePath -> [Word8]
decodeW8 = FilePath -> [Word8]
s2w8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. FilePath -> FilePath
_encodeFilePath

{- Like encodeW8 and decodeW8, but NULs are passed through unchanged. -}
encodeW8NUL :: [Word8] -> FilePath
encodeW8NUL :: [Word8] -> FilePath
encodeW8NUL = forall a. [a] -> [[a]] -> [a]
intercalate [Char
nul] forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map [Word8] -> FilePath
encodeW8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall c. Eq c => c -> [c] -> [[c]]
splitc (Char -> Word8
c2w8 Char
nul)
  where
	nul :: Char
nul = Char
'\NUL'

decodeW8NUL :: FilePath -> [Word8]
decodeW8NUL :: FilePath -> [Word8]
decodeW8NUL = forall a. [a] -> [[a]] -> [a]
intercalate [Char -> Word8
c2w8 Char
nul] forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map FilePath -> [Word8]
decodeW8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall c. Eq c => c -> [c] -> [[c]]
splitc Char
nul
  where
	nul :: Char
nul = Char
'\NUL'

c2w8 :: Char -> Word8
c2w8 :: Char -> Word8
c2w8 = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> Int
fromEnum

w82c :: Word8 -> Char
w82c :: Word8 -> Char
w82c = forall a. Enum a => Int -> a
toEnum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral

s2w8 :: String -> [Word8]
s2w8 :: FilePath -> [Word8]
s2w8 = forall a b. (a -> b) -> [a] -> [b]
map Char -> Word8
c2w8

w82s :: [Word8] -> String
w82s :: [Word8] -> FilePath
w82s = forall a b. (a -> b) -> [a] -> [b]
map Word8 -> Char
w82c

{- Truncates a FilePath to the given number of bytes (or less),
 - as represented on disk.
 -
 - Avoids returning an invalid part of a unicode byte sequence, at the
 - cost of efficiency when running on a large FilePath.
 -}
truncateFilePath :: Int -> FilePath -> FilePath
#ifndef mingw32_HOST_OS
truncateFilePath :: Int -> FilePath -> FilePath
truncateFilePath Int
n = FilePath -> FilePath
go forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. [a] -> [a]
reverse
  where
	go :: FilePath -> FilePath
go FilePath
f =
		let bytes :: [Word8]
bytes = FilePath -> [Word8]
decodeW8 FilePath
f
		in if forall (t :: * -> *) a. Foldable t => t a -> Int
length [Word8]
bytes forall a. Ord a => a -> a -> Bool
<= Int
n
			then forall a. [a] -> [a]
reverse FilePath
f
			else FilePath -> FilePath
go (forall a. Int -> [a] -> [a]
drop Int
1 FilePath
f)
#else
{- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = reverse . go [] n . L8.fromString
  where
	go coll cnt bs
		| cnt <= 0 = coll
		| otherwise = case L8.decode bs of
			Just (c, x) | c /= L8.replacement_char ->
				let x' = fromIntegral x
				in if cnt - x' < 0
					then coll
					else go (c:coll) (cnt - x') (L8.drop 1 bs)
			_ -> coll
#endif