{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
module Data.Text.Encoding
(
decodeASCII
, decodeLatin1
, decodeUtf8
, decodeUtf16LE
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
, decodeUtf8'
, decodeUtf8With
, decodeUtf16LEWith
, decodeUtf16BEWith
, decodeUtf32LEWith
, decodeUtf32BEWith
, streamDecodeUtf8
, streamDecodeUtf8With
, Decoding(..)
, encodeUtf8
, encodeUtf16LE
, encodeUtf16BE
, encodeUtf32LE
, encodeUtf32BE
, encodeUtf8Builder
, encodeUtf8BuilderEscaped
) where
#if __GLASGOW_HASKELL__ >= 702
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
#else
import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
#endif
import Control.Exception (evaluate, try)
import Control.Monad.ST (runST)
import Data.ByteString as B
import Data.ByteString.Internal as B hiding (c2w)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show ()
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word32)
#if __GLASGOW_HASKELL__ >= 703
import Foreign.C.Types (CSize)
#else
import Foreign.C.Types (CSize)
#endif
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import GHC.Base (MutableByteArray#)
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Fusion as F
#include "text_cbits.h"
decodeASCII :: ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
decodeLatin1 :: ByteString -> Text
decodeLatin1 s = F.unstream (E.streamASCII s)
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With onErr s@(PS fp off len) = runST $ do
dest <- A.new len
unsafeIOToST $ do
withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->do
let curPtr = ptr `plusPtr` off
let end = ptr `plusPtr` (off + len)
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
if curPtr' == end
then do
n <- peek destOffPtr
dest' <- unsafeSTToIO (A.unsafeFreeze dest)
return (Text dest' 0 (fromIntegral n))
else do
return (F.unstream (E.streamUtf8 onErr s))
data Decoding = Some Text ByteString (ByteString -> Decoding)
instance Show Decoding where
showsPrec d (Some t bs _) = showParen (d > prec) $
showString "Some " . showsPrec prec' t .
showChar ' ' . showsPrec prec' bs .
showString " _"
where prec = 10; prec' = prec + 1
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
where
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->
with codepoint0 $ \codepointPtr ->
with state0 $ \statePtr ->
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` (off + len)
loop curPtr = do
poke curPtrPtr curPtr
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
x <- peek curPtr'
poke statePtr 0
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1
_ -> do
n <- peek destOffPtr
codepoint <- peek codepointPtr
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
_ -> B.append undecoded0 (B.drop left bs)
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop (ptr `plusPtr` off)
desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
decodeUtf8 :: ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}
{-# RULES "STREAM stream/decodeUtf8 fusion" [1]
forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
decodeUtf8' :: ByteString -> Either UnicodeException Text
decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode
{-# INLINE decodeUtf8' #-}
encodeUtf8Builder :: Text -> B.Builder
encodeUtf8Builder = \t -> B.builder (textCopyStep t)
{-# INLINE encodeUtf8Builder #-}
textCopyStep :: Text -> B.BuildStep a -> B.BuildStep a
textCopyStep !(Text arr off len) k = go 0 len
where
go !ip !ipe !(B.BufferRange op ope)
| inpRemaining <= outRemaining = do
A.copyToPtr op 0 arr (off + ip) inpRemaining
let !br' = B.BufferRange (op `plusPtr` inpRemaining) ope
k br'
| otherwise = do
A.copyToPtr op 0 arr (off + ip) outRemaining
let !ip' = ip + outRemaining
return $ B.bufferFull 1 ope (go ip' ipe)
where
outRemaining = ope `minusPtr` op
inpRemaining = ipe - ip
{-# INLINE textCopyStep #-}
{-# INLINE encodeUtf8BuilderEscaped #-}
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped be =
\txt -> B.builder (mkBuildstep txt)
where
bound = max 4 $ BP.sizeBound be
mkBuildstep (Text arr off len) !k =
outerLoop off
where
iend = off + len
outerLoop !i0 !br@(B.BufferRange op0 ope)
| i0 >= iend = k br
| outRemaining > 0 = goPartial (i0 + min outRemaining inpRemaining)
| otherwise = return $ B.bufferFull bound op0 (outerLoop i0)
where
outRemaining = (ope `minusPtr` op0) `div` bound
inpRemaining = iend - i0
goPartial !iendTmp = go i0 op0
where
go !i !op
| i < iendTmp = case () of
_ | a <= 0x7F ->
BP.runB be (fromIntegral a) op >>= go (i + 1)
| 0xC2 <= a && a <= 0xDF -> do
poke8 0 a
poke8 1 b
go (i + 2) (op `plusPtr` 2)
| 0xE0 <= a && a <= 0xEF -> do
poke8 0 a
poke8 1 b
poke8 2 c
go (i + 3) (op `plusPtr` 3)
| otherwise -> do
poke8 0 a
poke8 1 b
poke8 2 c
poke8 3 d
go (i + 4) (op `plusPtr` 4)
| otherwise =
outerLoop i (B.BufferRange op ope)
where
poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
a = A.unsafeIndex arr i
b = A.unsafeIndex arr (i+1)
c = A.unsafeIndex arr (i+2)
d = A.unsafeIndex arr (i+3)
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text arr off len)
| len == 0 = B.empty
| otherwise = B.unsafeCreate len (\op -> A.copyToPtr op 0 arr off len)
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
{-# INLINE decodeUtf16LEWith #-}
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
{-# INLINE decodeUtf16BEWith #-}
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE txt = E.unstream (E.restreamUtf16LE (F.stream txt))
{-# INLINE encodeUtf16LE #-}
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE txt = E.unstream (E.restreamUtf16BE (F.stream txt))
{-# INLINE encodeUtf16BE #-}
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
{-# INLINE decodeUtf32LEWith #-}
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
{-# INLINE decodeUtf32BEWith #-}
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE txt = E.unstream (E.restreamUtf32LE (F.stream txt))
{-# INLINE encodeUtf32LE #-}
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
{-# INLINE encodeUtf32BE #-}
foreign import ccall unsafe "_hs_text_utf_8_decode_utf8" c_decode_utf8
:: MutableByteArray# s -> Ptr CSize
-> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
foreign import ccall unsafe "_hs_text_utf_8_decode_utf8_state" c_decode_utf8_with_state
:: MutableByteArray# s -> Ptr CSize
-> Ptr (Ptr Word8) -> Ptr Word8
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)