{-# language BangPatterns #-}
{-# language BinaryLiterals #-}
{-# language DataKinds #-}
{-# language DeriveFunctor #-}
{-# language DerivingStrategies #-}
{-# language GADTSyntax #-}
{-# language KindSignatures #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language MultiWayIf #-}
{-# language PolyKinds #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language StandaloneDeriving #-}
{-# language TypeApplications #-}
{-# language UnboxedSums #-}
{-# language UnboxedTuples #-}
{-# language CPP #-}

-- | Parse input as UTF-8-encoded text. Parsers in this module will
-- fail if they encounter a byte above @0x7F@.
module Data.Bytes.Parser.Utf8
  ( -- * Get Character
    any#
  , shortText
  ) where

import Prelude hiding (length,any,fail,takeWhile)

import Data.Bits ((.&.),(.|.),unsafeShiftL,xor)
import Data.Bytes.Parser.Internal (Parser(..))
import Data.Text.Short (ShortText)
import GHC.Exts (Int(I#),Char(C#),Int#,Char#,(-#),(+#),(>#),chr#)
import GHC.Word (Word8(W8#))

import qualified Data.ByteString.Short.Internal as BSS
import qualified Data.Bytes.Parser as Parser
import qualified Data.Primitive as PM
import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts

-- | Interpret the next one to four bytes as a UTF-8-encoded character.
-- Fails if the decoded codepoint is in the range U+D800 through U+DFFF.
any# :: e -> Parser e s Char#
{-# noinline any# #-}
any# :: e -> Parser e s Char#
any# e
e = ((# ByteArray#, Int#, Int# #) -> ST# s (Result# e Char#))
-> Parser e s Char#
forall e s a.
((# ByteArray#, Int#, Int# #) -> ST# s (Result# e a))
-> Parser e s a
Parser
  (\(# ByteArray#
arr, Int#
off, Int#
len #) State# s
s0 -> case Int#
len Int# -> Int# -> Int#
># Int#
0# of
    Int#
1# ->
      let !w0 :: Word#
w0 = ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr Int#
off
       in if | Word8 -> Bool
oneByteChar (Word# -> Word8
W8# Word#
w0) ->
                 (# State# s
s0, (# | (# Int# -> Char#
chr# (Word# -> Int#
Exts.word2Int# (
#if MIN_VERSION_base(4,16,0)
                 Exts.word8ToWord#
#endif
                 Word#
w0)), Int#
off Int# -> Int# -> Int#
+# Int#
1#, Int#
len Int# -> Int# -> Int#
-# Int#
1# #) #) #)
             | Word8 -> Bool
twoByteChar (Word# -> Word8
W8# Word#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1
                    , Word#
w1 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1#)
                    , Word8 -> Bool
followingByte (Word# -> Word8
W8# Word#
w1)
                    , C# Char#
c <- Word8 -> Word8 -> Char
codepointFromTwoBytes (Word# -> Word8
W8# Word#
w0) (Word# -> Word8
W8# Word#
w1)
                      -> (# State# s
s0, (# | (# Char#
c, Int#
off Int# -> Int# -> Int#
+# Int#
2#, Int#
len Int# -> Int# -> Int#
-# Int#
2# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Word8 -> Bool
threeByteChar (Word# -> Word8
W8# Word#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
2
                    , Word#
w1 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1# )
                    , Word#
w2 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
2# )
                    , Word8 -> Bool
followingByte (Word# -> Word8
W8# Word#
w1)
                    , !c :: Char
c@(C# Char#
c#) <- Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes (Word# -> Word8
W8# Word#
w0) (Word# -> Word8
W8# Word#
w1) (Word# -> Word8
W8# Word#
w2)
                    , Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< Char
'\xD800' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> Char
'\xDFFF'
                      -> (# State# s
s0, (# | (# Char#
c#, Int#
off Int# -> Int# -> Int#
+# Int#
3#, Int#
len Int# -> Int# -> Int#
-# Int#
3# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Word8 -> Bool
fourByteChar (Word# -> Word8
W8# Word#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
3
                    , Word#
w1 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1# )
                    , Word#
w2 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
2# )
                    , Word#
w3 <- ByteArray# -> Int# -> Word#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
3# )
                    , Word8 -> Bool
followingByte (Word# -> Word8
W8# Word#
w1)
                    , !(C# Char#
c#) <- Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes (Word# -> Word8
W8# Word#
w0) (Word# -> Word8
W8# Word#
w1) (Word# -> Word8
W8# Word#
w2) (Word# -> Word8
W8# Word#
w3)
                      -> (# State# s
s0, (# | (# Char#
c#, Int#
off Int# -> Int# -> Int#
+# Int#
4#, Int#
len Int# -> Int# -> Int#
-# Int#
4# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
    Int#
_ -> (# State# s
s0, (# e
e | #) #)
  )

codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes Word8
w1 Word8
w2 Word8
w3 Word8
w4 = Char# -> Char
C#
  ( Int# -> Char#
chr#
    ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$ Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
      ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00001111) Int
18 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
12 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w3 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
6 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        (Word8 -> Word
word8ToWord Word8
w4 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
      )
    )
  )

codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes Word8
w1 Word8
w2 Word8
w3 = Char# -> Char
C#
  ( Int# -> Char#
chr#
    ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$ Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
      ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00001111) Int
12 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
6 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        (Word8 -> Word
word8ToWord Word8
w3 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
      )
    )
  )

codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes Word8
w1 Word8
w2 = Char# -> Char
C#
  ( Int# -> Char#
chr#
    ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$ Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral @Word @Int
      ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00011111) Int
6 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|.
        (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
      )
    )
  )

oneByteChar :: Word8 -> Bool
oneByteChar :: Word8 -> Bool
oneByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b10000000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0

twoByteChar :: Word8 -> Bool
twoByteChar :: Word8 -> Bool
twoByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11100000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11000000

threeByteChar :: Word8 -> Bool
threeByteChar :: Word8 -> Bool
threeByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11110000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11100000

fourByteChar :: Word8 -> Bool
fourByteChar :: Word8 -> Bool
fourByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11111000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11110000

followingByte :: Word8 -> Bool
followingByte :: Word8 -> Bool
followingByte !Word8
w = Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
xor Word8
w Word8
0b01000000 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11000000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11000000

word8ToWord :: Word8 -> Word
word8ToWord :: Word8 -> Word
word8ToWord = Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral

unI :: Int -> Int#
unI :: Int -> Int#
unI (I# Int#
w) = Int#
w

-- | Consume input that matches the argument. Fails if the
-- input does not match.
shortText :: e -> ShortText -> Parser e s ()
shortText :: e -> ShortText -> Parser e s ()
shortText e
e !ShortText
t = e -> ByteArray -> Parser e s ()
forall e s. e -> ByteArray -> Parser e s ()
Parser.byteArray e
e
  (ShortByteString -> ByteArray
shortByteStringToByteArray (ShortText -> ShortByteString
TS.toShortByteString ShortText
t))

shortByteStringToByteArray ::
     BSS.ShortByteString
  -> PM.ByteArray
shortByteStringToByteArray :: ShortByteString -> ByteArray
shortByteStringToByteArray (BSS.SBS ByteArray#
x) = ByteArray# -> ByteArray
PM.ByteArray ByteArray#
x