{-# LANGUAGE CPP #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE UnliftedFFITypes #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiWayIf #-} module Data.JsonStream.Unescape ( unescapeText ) where import Data.ByteString as B import Data.ByteString.Internal as B hiding (c2w) import Data.Text.Encoding.Error (UnicodeException (..)) import Data.Text.Internal (Text (..)) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8, Word32) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Ptr (Ptr, plusPtr) import Foreign.Storable (peek) #if MIN_VERSION_text(2,0,0) import qualified Data.Primitive as P import qualified Data.Text.Array as T import qualified Data.Text.Internal as T import Data.Bits (shiftL, shiftR, (.&.), (.|.)) import Control.Exception (try, throwIO) import Foreign.ForeignPtr (ForeignPtr) import GHC.ForeignPtr (plusForeignPtr) #else import Control.Exception (evaluate, throw, try) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Text.Internal.Private (runText) import Foreign.Marshal.Utils (with) import qualified Data.Text.Array as A import GHC.Base (MutableByteArray#) import Foreign.C.Types (CInt (..), CSize (..)) #endif #if !MIN_VERSION_text(2,0,0) foreign import ccall unsafe "_jstream_decode_string" c_js_decode :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO CInt unescapeText' :: ByteString -> Text unescapeText' (PS fp off len) = runText $ \done -> do let go dest = withForeignPtr fp $ \ptr -> with (0::CSize) $ \destOffPtr -> do let end = ptr `plusPtr` (off + len) loop curPtr = do res <- c_js_decode (A.maBA dest) destOffPtr curPtr end case res of 0 -> do n <- peek destOffPtr unsafeSTToIO (done dest (fromIntegral n)) _ -> throw (DecodeError desc Nothing) loop (ptr `plusPtr` off) (unsafeIOToST . go) =<< A.new len where desc = "Data.JsonStream.Unescape.unescapeText': Invalid UTF-8 stream" {-# INLINE unescapeText' #-} unescapeText :: ByteString -> Either UnicodeException Text unescapeText = unsafeDupablePerformIO . try . evaluate . unescapeText' {-# INLINE unescapeText #-} #else withBS :: ByteString -> (ForeignPtr Word8 -> Int -> r) -> r #if MIN_VERSION_bytestring(0,11,0) withBS (BS !sfp !slen) kont = kont sfp slen #else withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen #endif {-# INLINE withBS #-} unescapeText :: ByteString -> Either UnicodeException Text unescapeText = unsafeDupablePerformIO . try . unescapeTextIO throwDecodeError :: IO a throwDecodeError = let desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" in throwIO (DecodeError desc Nothing) -- The following is copied from aeson-2.0 ------------------------------------------------------------------------------- -- unescapeTextIO ------------------------------------------------------------------------------- -- This function is generated using staged-streams -- See: https://github.com/phadej/staged/blob/master/staged-streams-unicode/src/Unicode/JSON.hs -- -- Because @aeson@ better to not use template-haskell itself, -- we dump the splice and prettify it by hand a bit. -- unescapeTextIO :: ByteString -> IO Text unescapeTextIO bs = withBS bs $ \fptr len -> withForeignPtr fptr $ \begin -> do let end :: Ptr Word8 end = plusPtr begin len arr <- P.newPrimArray len let write3bytes :: Int -> Word8 -> Word8 -> Word8 -> Ptr Word8 -> IO Text write3bytes !out !b1 !b2 !b3 !inp = do P.writePrimArray arr out b1 write2bytes (out + 1) b2 b3 inp write2bytes :: Int -> Word8 -> Word8 -> Ptr Word8 -> IO Text write2bytes !out !b1 !b2 !inp = do P.writePrimArray arr out b1 write1byte (out + 1) b2 inp write1byte :: Int -> Word8 -> Ptr Word8 -> IO Text write1byte !out !b1 !inp = do P.writePrimArray arr out b1 state_start (out + 1) inp writeCodePoint :: Int -> Ptr Word8 -> Word32 -> IO Text writeCodePoint !out !inp !acc | acc <= 127 = do P.writePrimArray arr out (fromIntegral acc :: Word8) state_start (out + 1) (plusPtr inp 1) | acc <= 2047 = do let b1 = fromIntegral (shiftR acc 6 .|. 192) :: Word8 let b2 = fromIntegral ((acc .&. 63) .|. 128) :: Word8 P.writePrimArray arr out b1 write1byte (out + 1) b2 (plusPtr inp 1) | acc <= 65535 = do let b1 = fromIntegral (shiftR acc 12 .|. 224) :: Word8 let b2 = fromIntegral ((shiftR acc 6 .&. 63) .|. 128) :: Word8 let b3 = fromIntegral ((acc .&. 63) .|. 128) :: Word8 P.writePrimArray arr out b1 write2bytes (out + 1) b2 b3 (plusPtr inp 1) | otherwise = do let b1 = fromIntegral (shiftR acc 18 .|. 240) :: Word8 let b2 = fromIntegral ((shiftR acc 12 .&. 63) .|. 128) :: Word8 let b3 = fromIntegral ((shiftR acc 6 .&. 63) .|. 128) :: Word8 let b4 = fromIntegral ((acc .&. 63) .|. 128) :: Word8 P.writePrimArray arr out b1 write3bytes (out + 1) b2 b3 b4 (plusPtr inp 1) state_sudone :: Int -> Ptr Word8 -> Word32 -> Word32 -> IO Text state_sudone !out !inp !hi !lo | 56320 <= lo, lo <= 57343 = writeCodePoint out inp (65536 + (shiftL (hi - 55296) 10 .|. (lo - 56320))) | otherwise = throwDecodeError state_su4 :: Int -> Ptr Word8 -> Word32 -> Word32 -> IO Text state_su4 !out !inp !hi !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_sudone out inp hi (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_sudone out inp hi (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_sudone out inp hi (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_su3 :: Int -> Ptr Word8 -> Word32 -> Word32 -> IO Text state_su3 !out !inp !hi !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_su4 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_su4 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_su4 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_su2 :: Int -> Ptr Word8 -> Word32 -> Word32 -> IO Text state_su2 !out !inp !hi !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_su3 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_su3 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_su3 out (plusPtr inp 1) hi (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_su1 :: Int -> Ptr Word8 -> Word32 -> IO Text state_su1 !out !inp !hi | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_su2 out (plusPtr inp 1) hi (fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_su2 out (plusPtr inp 1) hi (fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_su2 out (plusPtr inp 1) hi (fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_su :: Int -> Ptr Word8 -> Word32 -> IO Text state_su !out !inp !hi | inp == end = throwDecodeError | otherwise = do w8 <- peek inp case w8 of 117 -> state_su1 out (plusPtr inp 1) hi _ -> throwDecodeError state_ss :: Int -> Ptr Word8 -> Word32 -> IO Text state_ss !out !inp !hi | inp == end = throwDecodeError | otherwise = do w8 <- peek inp case w8 of 92 -> state_su out (plusPtr inp 1) hi _ -> throwDecodeError state_udone :: Int -> Ptr Word8 -> Word32 -> IO Text state_udone !out !inp !acc | acc < 55296 || acc > 57343 = writeCodePoint out inp acc | acc < 56320 = state_ss out (plusPtr inp 1) acc | otherwise = throwDecodeError state_u4 :: Int -> Ptr Word8 -> Word32 -> IO Text state_u4 !out !inp !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_udone out inp (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_udone out inp (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_udone out inp (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_u3 :: Int -> Ptr Word8 -> Word32 -> IO Text state_u3 !out !inp !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_u4 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_u4 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_u4 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_u2 :: Int -> Ptr Word8 -> Word32 -> IO Text state_u2 !out !inp !acc | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_u3 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_u3 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_u3 out (plusPtr inp 1) (shiftL acc 4 .|. fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_u1 :: Int -> Ptr Word8 -> IO Text state_u1 !out !inp | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | 48 <= w8, w8 <= 57 -> state_u2 out (plusPtr inp 1) (fromIntegral (w8 - 48)) | 65 <= w8, w8 <= 70 -> state_u2 out (plusPtr inp 1) (fromIntegral (w8 - 55)) | 97 <= w8, w8 <= 102 -> state_u2 out (plusPtr inp 1) (fromIntegral (w8 - 87)) | otherwise -> throwDecodeError state_escape :: Int -> Ptr Word8 -> IO Text state_escape !out !inp | inp == end = throwDecodeError | otherwise = do w8 <- peek inp case w8 of 34 -> do P.writePrimArray arr out 34 state_start (out + 1) (plusPtr inp 1) 92 -> do P.writePrimArray arr out 92 state_start (out + 1) (plusPtr inp 1) 47 -> do P.writePrimArray arr out 47 state_start (out + 1) (plusPtr inp 1) 98 -> do P.writePrimArray arr out 8 state_start (out + 1) (plusPtr inp 1) 102 -> do P.writePrimArray arr out 12 state_start (out + 1) (plusPtr inp 1) 110 -> do P.writePrimArray arr out 10 state_start (out + 1) (plusPtr inp 1) 114 -> do P.writePrimArray arr out 13 state_start (out + 1) (plusPtr inp 1) 116 -> do P.writePrimArray arr out 9 state_start (out + 1) (plusPtr inp 1) 117 -> state_u1 out (plusPtr inp 1) _ -> throwDecodeError state_input4c :: Int -> Ptr Word8 -> Word8 -> Word8 -> Word8 -> IO Text state_input4c !out !inp !b1 !b2 !b3 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128 , let acc = shiftL (fromIntegral (b1 .&. 7)) 18 , let acc' = acc .|. shiftL (fromIntegral (b2 .&. 63)) 12 , let acc'' = acc' .|. shiftL (fromIntegral (b3 .&. 63)) 6 , let acc''' = acc'' .|. fromIntegral (w8 .&. 63) :: Word32 , acc''' >= 65536 && acc''' < 1114112 -> do P.writePrimArray arr out b1 write3bytes (out + 1) b2 b3 w8 (plusPtr inp 1) | otherwise -> throwDecodeError state_input4b :: Int -> Ptr Word8 -> Word8 -> Word8 -> IO Text state_input4b !out !inp !b1 !b2 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128 -> state_input4c out (plusPtr inp 1) b1 b2 w8 | otherwise -> throwDecodeError state_input4 :: Int -> Ptr Word8 -> Word8 -> IO Text state_input4 !out !inp !b1 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128 -> state_input4b out (plusPtr inp 1) b1 w8 | otherwise -> throwDecodeError state_input3b :: Int -> Ptr Word8 -> Word8 -> Word8 -> IO Text state_input3b !out !inp !b1 !b2 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128 , let acc = shiftL (fromIntegral (b1 .&. 15)) 12 , let acc' = acc .|. shiftL (fromIntegral (b2 .&. 63)) 6 , let acc'' = acc' .|. fromIntegral (w8 .&. 63) :: Word32 , (acc'' >= 2048 && acc'' < 55296) || acc'' > 57343 -> do P.writePrimArray arr out b1 write2bytes (out + 1) b2 w8 (plusPtr inp 1) | otherwise -> throwDecodeError state_input3 :: Int -> Ptr Word8 -> Word8 -> IO Text state_input3 !out !inp !b1 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128 -> state_input3b out (plusPtr inp 1) b1 w8 | otherwise -> throwDecodeError state_input2 :: Int -> Ptr Word8 -> Word8 -> IO Text state_input2 !out !inp !b1 | inp == end = throwDecodeError | otherwise = do w8 <- peek inp if | (w8 .&. 192) == 128, let acc = shiftL (fromIntegral (b1 .&. 63)) 6 :: Word32 acc' = acc .|. fromIntegral (w8 .&. 63) :: Word32 , acc' >= 128 -> do P.writePrimArray arr out b1 write1byte (out + 1) w8 (plusPtr inp 1) | otherwise -> throwDecodeError state_start :: Int -> Ptr Word8 -> IO Text state_start !out !inp | inp == end = do P.shrinkMutablePrimArray arr out frozenArr <- P.unsafeFreezePrimArray arr return $ case frozenArr of P.PrimArray ba -> T.Text (T.ByteArray ba) 0 out | otherwise = do w8 <- peek inp if | w8 == 92 -> state_escape out (plusPtr inp 1) | w8 < 128 -> do P.writePrimArray arr out w8 state_start (out + 1) (plusPtr inp 1) | w8 < 192 -> throwDecodeError | w8 < 224 -> state_input2 out (plusPtr inp 1) w8 | w8 < 240 -> state_input3 out (plusPtr inp 1) w8 | w8 < 248 -> state_input4 out (plusPtr inp 1) w8 | otherwise -> throwDecodeError -- start the state machine state_start (0 :: Int) begin #endif