{-# LANGUAGE CPP #-} {-# LANGUAGE NoImplicitPrelude #-} {- | Module : System.Win32.Encoding Copyright : 2012 shelarcy License : BSD-style Maintainer : shelarcy@gmail.com Stability : Provisional Portability : Non-portable (Win32 API) Enocode/Decode mutibyte character using Win32 API. -} module GHC.IO.Windows.Encoding ( encodeMultiByte , encodeMultiByteIO , encodeMultiByteRawIO , decodeMultiByte , decodeMultiByteIO , wideCharToMultiByte , multiByteToWideChar , withGhcInternalToUTF16 , withUTF16ToGhcInternal ) where import Data.Word (Word8, Word16) import Foreign.C.Types (CInt(..)) import Foreign.C.String (peekCAStringLen, peekCWStringLen, withCWStringLen, withCAStringLen, ) import Foreign.Ptr (nullPtr, Ptr ()) import Foreign.Marshal.Array (allocaArray) import Foreign.Marshal.Unsafe (unsafeLocalState) import GHC.Windows import GHC.IO.Encoding.CodePage (CodePage, getCurrentCodePage) import GHC.IO import GHC.Base import GHC.Real #include "windows_cconv.h" -- | The "System.IO" output functions (e.g. `putStr`) don't -- automatically convert to multibyte string on Windows, so this -- function is provided to make the conversion from a Unicode string -- in the given code page to a proper multibyte string. To get the -- code page for the console, use `getCurrentCodePage`. -- encodeMultiByte :: CodePage -> String -> String encodeMultiByte cp = unsafeLocalState . encodeMultiByteIO cp {-# INLINE encodeMultiByteIO' #-} -- | String must not be zero length. encodeMultiByteIO' :: CodePage -> String -> ((LPCSTR, CInt) -> IO a) -> IO a encodeMultiByteIO' cp wstr transformer = withCWStringLen wstr $ \(cwstr,len) -> do mbchars' <- failIfZero "WideCharToMultiByte" $ wideCharToMultiByte cp 0 cwstr (fromIntegral len) nullPtr 0 nullPtr nullPtr -- mbchar' is the length of buffer required allocaArray (fromIntegral mbchars') $ \mbstr -> do mbchars <- failIfZero "WideCharToMultiByte" $ wideCharToMultiByte cp 0 cwstr (fromIntegral len) mbstr mbchars' nullPtr nullPtr transformer (mbstr,fromIntegral mbchars) -- converts [Char] to UTF-16 encodeMultiByteIO :: CodePage -> String -> IO String encodeMultiByteIO _ "" = return "" encodeMultiByteIO cp s = encodeMultiByteIO' cp s toString where toString (st,l) = peekCAStringLen (st,fromIntegral l) -- converts [Char] to UTF-16 encodeMultiByteRawIO :: CodePage -> String -> IO (LPCSTR, CInt) encodeMultiByteRawIO _ "" = return (nullPtr, 0) encodeMultiByteRawIO cp s = encodeMultiByteIO' cp s toSizedCString where toSizedCString (st,l) = return (st, fromIntegral l) foreign import WINDOWS_CCONV "WideCharToMultiByte" wideCharToMultiByte :: CodePage -> DWORD -- dwFlags, -> LPCWSTR -- lpWideCharStr -> CInt -- cchWideChar -> LPSTR -- lpMultiByteStr -> CInt -- cbMultiByte -> LPCSTR -- lpMultiByteStr -> LPBOOL -- lpbFlags -> IO CInt -- | The `System.IO` input functions (e.g. `getLine`) don't -- automatically convert to Unicode, so this function is provided to -- make the conversion from a multibyte string in the given code page -- to a proper Unicode string. To get the code page for the console, -- use `getConsoleCP`. stringToUnicode :: CodePage -> String -> IO String stringToUnicode _cp "" = return "" -- MultiByteToWideChar doesn't handle empty strings (#1929) stringToUnicode cp mbstr = withCAStringLen mbstr $ \(cstr,len) -> do wchars <- failIfZero "MultiByteToWideChar" $ multiByteToWideChar cp 0 cstr (fromIntegral len) nullPtr 0 -- wchars is the length of buffer required allocaArray (fromIntegral wchars) $ \cwstr -> do wchars' <- failIfZero "MultiByteToWideChar" $ multiByteToWideChar cp 0 cstr (fromIntegral len) cwstr wchars peekCWStringLen (cwstr,fromIntegral wchars') -- converts UTF-16 to [Char] foreign import WINDOWS_CCONV unsafe "MultiByteToWideChar" multiByteToWideChar :: CodePage -> DWORD -- dwFlags, -> LPCSTR -- lpMultiByteStr -> CInt -- cbMultiByte -> LPWSTR -- lpWideCharStr -> CInt -- cchWideChar -> IO CInt decodeMultiByte :: CodePage -> String -> String decodeMultiByte cp = unsafeLocalState . decodeMultiByteIO cp -- | Because of `stringToUnicode` is unclear name, we use `decodeMultiByteIO` -- for alias of `stringToUnicode`. decodeMultiByteIO :: CodePage -> String -> IO String decodeMultiByteIO = stringToUnicode {-# INLINE decodeMultiByteIO #-} foreign import WINDOWS_CCONV unsafe "MultiByteToWideChar" multiByteToWideChar' :: CodePage -> DWORD -- dwFlags, -> Ptr Word8 -- lpMultiByteStr -> CInt -- cbMultiByte -> Ptr Word16 -- lpWideCharStr -> CInt -- cchWideChar -> IO CInt -- TODO: GHC is internally UTF-32 which means we have re-encode for -- Windows which is annoying. Switch to UTF-16 on IoNative -- being default. withGhcInternalToUTF16 :: Ptr Word8 -> Int -> ((Ptr Word16, CInt) -> IO a) -> IO a withGhcInternalToUTF16 ptr len fn = do cp <- getCurrentCodePage wchars <- failIfZero "withGhcInternalToUTF16" $ multiByteToWideChar' cp 0 ptr (fromIntegral len) nullPtr 0 -- wchars is the length of buffer required allocaArray (fromIntegral wchars) $ \cwstr -> do wchars' <- failIfZero "withGhcInternalToUTF16" $ multiByteToWideChar' cp 0 ptr (fromIntegral len) cwstr wchars fn (cwstr, wchars') foreign import WINDOWS_CCONV "WideCharToMultiByte" wideCharToMultiByte' :: CodePage -> DWORD -- dwFlags, -> Ptr Word16 -- lpWideCharStr -> CInt -- cchWideChar -> Ptr Word8 -- lpMultiByteStr -> CInt -- cbMultiByte -> LPCSTR -- lpMultiByteStr -> LPBOOL -- lpbFlags -> IO CInt -- TODO: GHC is internally UTF-32 which means we have re-encode for -- Windows which is annoying. Switch to UTF-16 on IoNative -- being default. -- | Decode a UTF16 buffer into the given buffer in the current code page. -- The source UTF16 buffer is filled by the function given as argument. withUTF16ToGhcInternal :: Ptr Word8 -- Buffer to store the encoded string in. -> Int -- Length of the buffer -- Function to fill source buffer. -> ( CInt -- Size of available buffer in bytes -> Ptr Word16 -- Temporary source buffer. -> IO CInt -- Actual length of buffer content. ) -> IO Int -- Returns number of bytes stored in buffer. withUTF16ToGhcInternal ptr len fn = do cp <- getCurrentCodePage -- Annoyingly the IO system is very UTF-32 oriented and asks for bytes -- as buffer reads. Problem is we don't know how many bytes we'll end up -- having as UTF-32 MultiByte encoded UTF-16. So be conservative. We assume -- that a single byte may expand to atmost 1 Word16. So assume that each -- byte does and divide the requested number of bytes by two since each -- Word16 encoded wchar may expand to only two Word8 sequences. let reqBytes = fromIntegral (len `div` 2) allocaArray reqBytes $ \w_ptr -> do w_len <- fn (fromIntegral reqBytes) w_ptr if w_len == 0 then return 0 else do -- Get required length of encoding mbchars' <- failIfZero "withUTF16ToGhcInternal" $ wideCharToMultiByte' cp 0 w_ptr (fromIntegral w_len) nullPtr 0 nullPtr nullPtr assert (mbchars' <= (fromIntegral len)) $ do -- mbchar' is the length of buffer required mbchars <- failIfZero "withUTF16ToGhcInternal" $ wideCharToMultiByte' cp 0 w_ptr (fromIntegral w_len) ptr mbchars' nullPtr nullPtr return $ fromIntegral mbchars