{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE RecordWildCards #-}
#include "inline.hs"
module Streamly.Internal.Data.Unicode.Stream
(
decodeLatin1
, decodeUtf8
, decodeUtf8Lax
, DecodeError(..)
, DecodeState
, CodePoint
, decodeUtf8Either
, resumeDecodeUtf8Either
, decodeUtf8Arrays
, decodeUtf8ArraysLenient
, encodeLatin1
, encodeLatin1Lax
, encodeUtf8
, decodeUtf8D
, encodeUtf8D
, decodeUtf8LenientD
, decodeUtf8EitherD
, resumeDecodeUtf8EitherD
, decodeUtf8ArraysD
, decodeUtf8ArraysLenientD
, stripStart
, lines
, words
, unlines
, unwords
)
where
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Bits (shiftR, shiftL, (.|.), (.&.))
import Data.Char (ord)
import Data.Word (Word8)
import Foreign.ForeignPtr (touchForeignPtr)
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import Foreign.Storable (Storable(..))
import GHC.Base (assert, unsafeChr)
import GHC.ForeignPtr (ForeignPtr (..))
import GHC.IO.Encoding.Failure (isSurrogate)
import GHC.Ptr (Ptr (..), plusPtr)
import Prelude hiding (String, lines, words, unlines, unwords)
import System.IO.Unsafe (unsafePerformIO)
import Streamly (IsStream)
import Streamly.Data.Fold (Fold)
import Streamly.Memory.Array (Array)
import Streamly.Internal.Data.Unfold (Unfold)
import Streamly.Internal.Data.SVar (adaptState)
import Streamly.Internal.Data.Stream.StreamD (Stream(..), Step (..))
import Streamly.Internal.Data.Strict (Tuple'(..))
#if __GLASGOW_HASKELL__ < 800
import Streamly.Internal.Data.Stream.StreamD (pattern Stream)
#endif
import qualified Streamly.Internal.Memory.Array.Types as A
import qualified Streamly.Internal.Prelude as S
import qualified Streamly.Internal.Data.Stream.StreamD as D
data WList = WCons !Word8 !WList | WNil
{-# INLINE ord2 #-}
ord2 :: Char -> WList
ord2 c = assert (n >= 0x80 && n <= 0x07ff) (WCons x1 (WCons x2 WNil))
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
x2 = fromIntegral $ (n .&. 0x3F) + 0x80
{-# INLINE ord3 #-}
ord3 :: Char -> WList
ord3 c = assert (n >= 0x0800 && n <= 0xffff) (WCons x1 (WCons x2 (WCons x3 WNil)))
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x3 = fromIntegral $ (n .&. 0x3F) + 0x80
{-# INLINE ord4 #-}
ord4 :: Char -> WList
ord4 c = assert (n >= 0x10000) (WCons x1 (WCons x2 (WCons x3 (WCons x4 WNil))))
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (n .&. 0x3F) + 0x80
data CodingFailureMode
= TransliterateCodingFailure
| ErrorOnCodingFailure
deriving (Show)
{-# INLINE replacementChar #-}
replacementChar :: Char
replacementChar = '\xFFFD'
type CodePoint = Int
type DecodeState = Word8
decodeTable :: [Word8]
decodeTable = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
]
utf8d :: A.Array Word8
utf8d =
unsafePerformIO
$ D.runFold (A.writeNAlignedUnmanaged 64 (length decodeTable))
(D.fromList decodeTable)
{-# INLINE_NORMAL unsafePeekElemOff #-}
unsafePeekElemOff :: forall a. Storable a => Ptr a -> Int -> a
unsafePeekElemOff p i = let !x = A.unsafeInlineIO $ peekElemOff p i in x
{-# INLINE decode0 #-}
decode0 :: Ptr Word8 -> Word8 -> Tuple' DecodeState CodePoint
decode0 table byte =
let !t = table `unsafePeekElemOff` fromIntegral byte
!codep' = (0xff `shiftR` (fromIntegral t)) .&. fromIntegral byte
!state' = table `unsafePeekElemOff` (256 + fromIntegral t)
in assert ((byte > 0x7f || error showByte)
&& (state' /= 0 || error (showByte ++ showTable)))
(Tuple' state' codep')
where
utf8table =
let !(Ptr addr) = table
end = table `plusPtr` 364
in A.Array (ForeignPtr addr undefined) end end :: A.Array Word8
showByte = "Streamly: decode0: byte: " ++ show byte
showTable = " table: " ++ show utf8table
{-# INLINE decode1 #-}
decode1
:: Ptr Word8
-> DecodeState
-> CodePoint
-> Word8
-> Tuple' DecodeState CodePoint
decode1 table state codep byte =
let !t = table `unsafePeekElemOff` fromIntegral byte
!codep' = (fromIntegral byte .&. 0x3f) .|. (codep `shiftL` 6)
!state' = table `unsafePeekElemOff`
(256 + fromIntegral state + fromIntegral t)
in assert (codep' <= 0x10FFFF
|| error (showByte ++ showState state codep))
(Tuple' state' codep')
where
utf8table =
let !(Ptr addr) = table
end = table `plusPtr` 364
in A.Array (ForeignPtr addr undefined) end end :: A.Array Word8
showByte = "Streamly: decode1: byte: " ++ show byte
showState st cp =
" state: " ++ show st ++
" codepoint: " ++ show cp ++
" table: " ++ show utf8table
data DecodeError = DecodeError !DecodeState !CodePoint deriving Show
data FreshPoint s a
= FreshPointDecodeInit s
| FreshPointDecodeInit1 s Word8
| FreshPointDecodeFirst s Word8
| FreshPointDecoding s !DecodeState !CodePoint
| YieldAndContinue a (FreshPoint s a)
| Done
{-# INLINE_NORMAL decodeUtf8WithD #-}
decodeUtf8WithD :: Monad m => CodingFailureMode -> Stream m Word8 -> Stream m Char
decodeUtf8WithD cfm (Stream step state) =
let A.Array p _ _ = utf8d
!ptr = (unsafeForeignPtrToPtr p)
in Stream (step' ptr) (FreshPointDecodeInit state)
where
{-# INLINE transliterateOrError #-}
transliterateOrError e s =
case cfm of
ErrorOnCodingFailure -> error e
TransliterateCodingFailure -> YieldAndContinue replacementChar s
{-# INLINE inputUnderflow #-}
inputUnderflow =
case cfm of
ErrorOnCodingFailure ->
error "Streamly.Internal.Data.Stream.StreamD.decodeUtf8With: Input Underflow"
TransliterateCodingFailure -> YieldAndContinue replacementChar Done
{-# INLINE_LATE step' #-}
step' _ gst (FreshPointDecodeInit st) = do
r <- step (adaptState gst) st
return $ case r of
Yield x s -> Skip (FreshPointDecodeInit1 s x)
Skip s -> Skip (FreshPointDecodeInit s)
Stop -> Skip Done
step' _ _ (FreshPointDecodeInit1 st x) = do
case x > 0x7f of
False ->
return $ Skip $ YieldAndContinue
(unsafeChr (fromIntegral x))
(FreshPointDecodeInit st)
True -> return $ Skip $ FreshPointDecodeFirst st x
step' table _ (FreshPointDecodeFirst st x) = do
let (Tuple' sv cp) = decode0 table x
return $
case sv of
12 ->
Skip $
transliterateOrError
"Streamly.Internal.Data.Stream.StreamD.decodeUtf8With: Invalid UTF8 codepoint encountered"
(FreshPointDecodeInit st)
0 -> error "unreachable state"
_ -> Skip (FreshPointDecoding st sv cp)
step' table gst (FreshPointDecoding st statePtr codepointPtr) = do
r <- step (adaptState gst) st
case r of
Yield x s -> do
let (Tuple' sv cp) = decode1 table statePtr codepointPtr x
return $
case sv of
0 -> Skip $ YieldAndContinue (unsafeChr cp)
(FreshPointDecodeInit s)
12 ->
Skip $
transliterateOrError
"Streamly.Internal.Data.Stream.StreamD.decodeUtf8With: Invalid UTF8 codepoint encountered"
(FreshPointDecodeInit1 s x)
_ -> Skip (FreshPointDecoding s sv cp)
Skip s -> return $ Skip (FreshPointDecoding s statePtr codepointPtr)
Stop -> return $ Skip inputUnderflow
step' _ _ (YieldAndContinue c s) = return $ Yield c s
step' _ _ Done = return Stop
{-# INLINE decodeUtf8D #-}
decodeUtf8D :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8D = decodeUtf8WithD ErrorOnCodingFailure
{-# INLINE decodeUtf8LenientD #-}
decodeUtf8LenientD :: Monad m => Stream m Word8 -> Stream m Char
decodeUtf8LenientD = decodeUtf8WithD TransliterateCodingFailure
{-# INLINE_NORMAL resumeDecodeUtf8EitherD #-}
resumeDecodeUtf8EitherD
:: Monad m
=> DecodeState
-> CodePoint
-> Stream m Word8
-> Stream m (Either DecodeError Char)
resumeDecodeUtf8EitherD dst codep (Stream step state) =
let A.Array p _ _ = utf8d
!ptr = (unsafeForeignPtrToPtr p)
stt =
if dst == 0
then FreshPointDecodeInit state
else FreshPointDecoding state dst codep
in Stream (step' ptr) stt
where
{-# INLINE_LATE step' #-}
step' _ gst (FreshPointDecodeInit st) = do
r <- step (adaptState gst) st
return $ case r of
Yield x s -> Skip (FreshPointDecodeInit1 s x)
Skip s -> Skip (FreshPointDecodeInit s)
Stop -> Skip Done
step' _ _ (FreshPointDecodeInit1 st x) = do
case x > 0x7f of
False ->
return $ Skip $ YieldAndContinue
(Right $ unsafeChr (fromIntegral x))
(FreshPointDecodeInit st)
True -> return $ Skip $ FreshPointDecodeFirst st x
step' table _ (FreshPointDecodeFirst st x) = do
let (Tuple' sv cp) = decode0 table x
return $
case sv of
12 ->
Skip $ YieldAndContinue (Left $ DecodeError 0 (fromIntegral x))
(FreshPointDecodeInit st)
0 -> error "unreachable state"
_ -> Skip (FreshPointDecoding st sv cp)
step' table gst (FreshPointDecoding st statePtr codepointPtr) = do
r <- step (adaptState gst) st
case r of
Yield x s -> do
let (Tuple' sv cp) = decode1 table statePtr codepointPtr x
return $
case sv of
0 -> Skip $ YieldAndContinue (Right $ unsafeChr cp)
(FreshPointDecodeInit s)
12 ->
Skip $ YieldAndContinue (Left $ DecodeError statePtr codepointPtr)
(FreshPointDecodeInit1 s x)
_ -> Skip (FreshPointDecoding s sv cp)
Skip s -> return $ Skip (FreshPointDecoding s statePtr codepointPtr)
Stop -> return $ Skip $ YieldAndContinue (Left $ DecodeError statePtr codepointPtr) Done
step' _ _ (YieldAndContinue c s) = return $ Yield c s
step' _ _ Done = return Stop
{-# INLINE_NORMAL decodeUtf8EitherD #-}
decodeUtf8EitherD :: Monad m
=> Stream m Word8 -> Stream m (Either DecodeError Char)
decodeUtf8EitherD = resumeDecodeUtf8EitherD 0 0
data FlattenState s a
= OuterLoop s !(Maybe (DecodeState, CodePoint))
| InnerLoopDecodeInit s (ForeignPtr a) !(Ptr a) !(Ptr a)
| InnerLoopDecodeFirst s (ForeignPtr a) !(Ptr a) !(Ptr a) Word8
| InnerLoopDecoding s (ForeignPtr a) !(Ptr a) !(Ptr a)
!DecodeState !CodePoint
| YAndC !Char (FlattenState s a)
| D
{-# INLINE_NORMAL decodeUtf8ArraysWithD #-}
decodeUtf8ArraysWithD ::
MonadIO m
=> CodingFailureMode
-> Stream m (A.Array Word8)
-> Stream m Char
decodeUtf8ArraysWithD cfm (Stream step state) =
let A.Array p _ _ = utf8d
!ptr = (unsafeForeignPtrToPtr p)
in Stream (step' ptr) (OuterLoop state Nothing)
where
{-# INLINE transliterateOrError #-}
transliterateOrError e s =
case cfm of
ErrorOnCodingFailure -> error e
TransliterateCodingFailure -> YAndC replacementChar s
{-# INLINE inputUnderflow #-}
inputUnderflow =
case cfm of
ErrorOnCodingFailure ->
error
"Streamly.Internal.Data.Stream.StreamD.decodeUtf8ArraysWith: Input Underflow"
TransliterateCodingFailure -> YAndC replacementChar D
{-# INLINE_LATE step' #-}
step' _ gst (OuterLoop st Nothing) = do
r <- step (adaptState gst) st
return $
case r of
Yield A.Array {..} s ->
let p = unsafeForeignPtrToPtr aStart
in Skip (InnerLoopDecodeInit s aStart p aEnd)
Skip s -> Skip (OuterLoop s Nothing)
Stop -> Skip D
step' _ gst (OuterLoop st dst@(Just (ds, cp))) = do
r <- step (adaptState gst) st
return $
case r of
Yield A.Array {..} s ->
let p = unsafeForeignPtrToPtr aStart
in Skip (InnerLoopDecoding s aStart p aEnd ds cp)
Skip s -> Skip (OuterLoop s dst)
Stop -> Skip inputUnderflow
step' _ _ (InnerLoopDecodeInit st startf p end)
| p == end = do
liftIO $ touchForeignPtr startf
return $ Skip $ OuterLoop st Nothing
step' _ _ (InnerLoopDecodeInit st startf p end) = do
x <- liftIO $ peek p
case x > 0x7f of
False ->
return $ Skip $ YAndC
(unsafeChr (fromIntegral x))
(InnerLoopDecodeInit st startf (p `plusPtr` 1) end)
True -> return $ Skip $ InnerLoopDecodeFirst st startf p end x
step' table _ (InnerLoopDecodeFirst st startf p end x) = do
let (Tuple' sv cp) = decode0 table x
return $
case sv of
12 ->
Skip $
transliterateOrError
"Streamly.Internal.Data.Stream.StreamD.decodeUtf8ArraysWith: Invalid UTF8 codepoint encountered"
(InnerLoopDecodeInit st startf (p `plusPtr` 1) end)
0 -> error "unreachable state"
_ -> Skip (InnerLoopDecoding st startf (p `plusPtr` 1) end sv cp)
step' _ _ (InnerLoopDecoding st startf p end sv cp)
| p == end = do
liftIO $ touchForeignPtr startf
return $ Skip $ OuterLoop st (Just (sv, cp))
step' table _ (InnerLoopDecoding st startf p end statePtr codepointPtr) = do
x <- liftIO $ peek p
let (Tuple' sv cp) = decode1 table statePtr codepointPtr x
return $
case sv of
0 ->
Skip $
YAndC
(unsafeChr cp)
(InnerLoopDecodeInit st startf (p `plusPtr` 1) end)
12 ->
Skip $
transliterateOrError
"Streamly.Internal.Data.Stream.StreamD.decodeUtf8ArraysWith: Invalid UTF8 codepoint encountered"
(InnerLoopDecodeInit st startf (p `plusPtr` 1) end)
_ -> Skip (InnerLoopDecoding st startf (p `plusPtr` 1) end sv cp)
step' _ _ (YAndC c s) = return $ Yield c s
step' _ _ D = return Stop
{-# INLINE decodeUtf8ArraysD #-}
decodeUtf8ArraysD ::
MonadIO m
=> Stream m (A.Array Word8)
-> Stream m Char
decodeUtf8ArraysD = decodeUtf8ArraysWithD ErrorOnCodingFailure
{-# INLINE decodeUtf8ArraysLenientD #-}
decodeUtf8ArraysLenientD ::
MonadIO m
=> Stream m (A.Array Word8)
-> Stream m Char
decodeUtf8ArraysLenientD = decodeUtf8ArraysWithD TransliterateCodingFailure
data EncodeState s = EncodeState s !WList
{-# INLINE_NORMAL encodeUtf8D #-}
encodeUtf8D :: Monad m => Stream m Char -> Stream m Word8
encodeUtf8D (Stream step state) = Stream step' (EncodeState state WNil)
where
{-# INLINE_LATE step' #-}
step' gst (EncodeState st WNil) = do
r <- step (adaptState gst) st
return $
case r of
Yield c s ->
case ord c of
x
| x <= 0x7F ->
Yield (fromIntegral x) (EncodeState s WNil)
| x <= 0x7FF -> Skip (EncodeState s (ord2 c))
| x <= 0xFFFF ->
if isSurrogate c
then error
"Streamly.Internal.Data.Stream.StreamD.encodeUtf8: Encountered a surrogate"
else Skip (EncodeState s (ord3 c))
| otherwise -> Skip (EncodeState s (ord4 c))
Skip s -> Skip (EncodeState s WNil)
Stop -> Stop
step' _ (EncodeState s (WCons x xs)) = return $ Yield x (EncodeState s xs)
{-# INLINE decodeLatin1 #-}
decodeLatin1 :: (IsStream t, Monad m) => t m Word8 -> t m Char
decodeLatin1 = S.map (unsafeChr . fromIntegral)
{-# INLINE encodeLatin1 #-}
encodeLatin1 :: (IsStream t, Monad m) => t m Char -> t m Word8
encodeLatin1 = S.map convert
where
convert c =
let codepoint = ord c
in if codepoint > 255
then error $ "Streamly.String.encodeLatin1 invalid " ++
"input char codepoint " ++ show codepoint
else fromIntegral codepoint
{-# INLINE encodeLatin1Lax #-}
encodeLatin1Lax :: (IsStream t, Monad m) => t m Char -> t m Word8
encodeLatin1Lax = S.map (fromIntegral . ord)
{-# INLINE decodeUtf8 #-}
decodeUtf8 :: (Monad m, IsStream t) => t m Word8 -> t m Char
decodeUtf8 = D.fromStreamD . decodeUtf8D . D.toStreamD
{-# INLINE decodeUtf8Arrays #-}
decodeUtf8Arrays :: (MonadIO m, IsStream t) => t m (Array Word8) -> t m Char
decodeUtf8Arrays = D.fromStreamD . decodeUtf8ArraysD . D.toStreamD
{-# INLINE decodeUtf8Lax #-}
decodeUtf8Lax :: (Monad m, IsStream t) => t m Word8 -> t m Char
decodeUtf8Lax = D.fromStreamD . decodeUtf8LenientD . D.toStreamD
{-# INLINE decodeUtf8Either #-}
decodeUtf8Either :: (Monad m, IsStream t)
=> t m Word8 -> t m (Either DecodeError Char)
decodeUtf8Either = D.fromStreamD . decodeUtf8EitherD . D.toStreamD
{-# INLINE resumeDecodeUtf8Either #-}
resumeDecodeUtf8Either
:: (Monad m, IsStream t)
=> DecodeState
-> CodePoint
-> t m Word8
-> t m (Either DecodeError Char)
resumeDecodeUtf8Either st cp =
D.fromStreamD . resumeDecodeUtf8EitherD st cp . D.toStreamD
{-# INLINE decodeUtf8ArraysLenient #-}
decodeUtf8ArraysLenient ::
(MonadIO m, IsStream t) => t m (Array Word8) -> t m Char
decodeUtf8ArraysLenient =
D.fromStreamD . decodeUtf8ArraysLenientD . D.toStreamD
{-# INLINE encodeUtf8 #-}
encodeUtf8 :: (Monad m, IsStream t) => t m Char -> t m Word8
encodeUtf8 = D.fromStreamD . encodeUtf8D . D.toStreamD
{-# INLINE stripStart #-}
stripStart :: (Monad m, IsStream t) => t m Char -> t m Char
stripStart = S.dropWhile isSpace
{-# INLINE lines #-}
lines :: (Monad m, IsStream t) => Fold m Char b -> t m Char -> t m b
lines = S.splitOnSuffix (== '\n')
foreign import ccall unsafe "u_iswspace"
iswspace :: Int -> Int
{-# INLINE isSpace #-}
isSpace :: Char -> Bool
isSpace c
| uc <= 0x377 = uc == 32 || uc - 0x9 <= 4 || uc == 0xa0
| otherwise = iswspace (ord c) /= 0
where
uc = fromIntegral (ord c) :: Word
{-# INLINE words #-}
words :: (Monad m, IsStream t) => Fold m Char b -> t m Char -> t m b
words = S.wordsBy isSpace
{-# INLINE unlines #-}
unlines :: (MonadIO m, IsStream t) => Unfold m a Char -> t m a -> t m Char
unlines = S.interposeSuffix '\n'
{-# INLINE unwords #-}
unwords :: (MonadIO m, IsStream t) => Unfold m a Char -> t m a -> t m Char
unwords = S.interpose ' '