{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GADTSyntax #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE PolyKinds #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE UnboxedTuples #-}

{- | Parse input as UTF-8-encoded text. Parsers in this module will
fail if they encounter a byte above @0x7F@.
-}
module Data.Bytes.Parser.Utf8
  ( -- * Get Character
    any#
  , shortText
  ) where

import Prelude hiding (any, fail, length, takeWhile)

import Data.Bits (unsafeShiftL, xor, (.&.), (.|.))
import Data.Bytes.Parser.Internal (Parser (..))
import Data.Text.Short (ShortText)
import GHC.Exts (Char (C#), Char#, Int (I#), Int#, chr#, (+#), (-#), (>#))
import GHC.Word (Word8 (W8#))

import qualified Data.ByteString.Short.Internal as BSS
import qualified Data.Bytes.Parser as Parser
import qualified Data.Primitive as PM
import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts

{- FOURMOLU_DISABLE -}
-- | Interpret the next one to four bytes as a UTF-8-encoded character.
-- Fails if the decoded codepoint is in the range U+D800 through U+DFFF.
any# :: e -> Parser e s Char#
{-# noinline any# #-}
any# :: forall e s. e -> Parser e s Char#
any# e
e = ((# ByteArray#, Int#, Int# #) -> ST# s (Result# e Char#))
-> Parser e s Char#
forall a b c.
((# ByteArray#, Int#, Int# #) -> ST# b (Result# a c))
-> Parser a b c
Parser
  (\(# ByteArray#
arr, Int#
off, Int#
len #) State# s
s0 -> case Int#
len Int# -> Int# -> Int#
># Int#
0# of
    Int#
1# ->
      let !w0 :: Word8#
w0 = ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr Int#
off
       in if | Word8 -> Bool
oneByteChar (Word8# -> Word8
W8# Word8#
w0) ->
                 (# State# s
s0, (# | (# Int# -> Char#
chr# (Word# -> Int#
Exts.word2Int# (
#if MIN_VERSION_base(4,16,0)
                 Word8# -> Word#
Exts.word8ToWord#
#endif
                 Word8#
w0)), Int#
off Int# -> Int# -> Int#
+# Int#
1#, Int#
len Int# -> Int# -> Int#
-# Int#
1# #) #) #)
             | Word8 -> Bool
twoByteChar (Word8# -> Word8
W8# Word8#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1
                    , Word8#
w1 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1#)
                    , Word8 -> Bool
followingByte (Word8# -> Word8
W8# Word8#
w1)
                    , C# Char#
c <- Word8 -> Word8 -> Char
codepointFromTwoBytes (Word8# -> Word8
W8# Word8#
w0) (Word8# -> Word8
W8# Word8#
w1)
                      -> (# State# s
s0, (# | (# Char#
c, Int#
off Int# -> Int# -> Int#
+# Int#
2#, Int#
len Int# -> Int# -> Int#
-# Int#
2# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Word8 -> Bool
threeByteChar (Word8# -> Word8
W8# Word8#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
2
                    , Word8#
w1 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1# )
                    , Word8#
w2 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
2# )
                    , Word8 -> Bool
followingByte (Word8# -> Word8
W8# Word8#
w1)
                    , !c :: Char
c@(C# Char#
c#) <- Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes (Word8# -> Word8
W8# Word8#
w0) (Word8# -> Word8
W8# Word8#
w1) (Word8# -> Word8
W8# Word8#
w2)
                    , Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< Char
'\xD800' Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> Char
'\xDFFF'
                      -> (# State# s
s0, (# | (# Char#
c#, Int#
off Int# -> Int# -> Int#
+# Int#
3#, Int#
len Int# -> Int# -> Int#
-# Int#
3# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Word8 -> Bool
fourByteChar (Word8# -> Word8
W8# Word8#
w0) ->
                 if | Int# -> Int
I# Int#
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
3
                    , Word8#
w1 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
1# )
                    , Word8#
w2 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
2# )
                    , Word8#
w3 <- ByteArray# -> Int# -> Word8#
Exts.indexWord8Array# ByteArray#
arr (Int#
off Int# -> Int# -> Int#
+# Int#
3# )
                    , Word8 -> Bool
followingByte (Word8# -> Word8
W8# Word8#
w1)
                    , !(C# Char#
c#) <- Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes (Word8# -> Word8
W8# Word8#
w0) (Word8# -> Word8
W8# Word8#
w1) (Word8# -> Word8
W8# Word8#
w2) (Word8# -> Word8
W8# Word8#
w3)
                      -> (# State# s
s0, (# | (# Char#
c#, Int#
off Int# -> Int# -> Int#
+# Int#
4#, Int#
len Int# -> Int# -> Int#
-# Int#
4# #) #) #)
                    | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
             | Bool
otherwise -> (# State# s
s0, (# e
e | #) #)
    Int#
_ -> (# State# s
s0, (# e
e | #) #)
  )
{- FOURMOLU_ENABLE -}

codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes Word8
w1 Word8
w2 Word8
w3 Word8
w4 =
  Char# -> Char
C#
    ( Int# -> Char#
chr#
        ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$
            Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
              ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00001111) Int
18
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
12
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w3 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
6
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word8 -> Word
word8ToWord Word8
w4 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
              )
        )
    )

codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes Word8
w1 Word8
w2 Word8
w3 =
  Char# -> Char
C#
    ( Int# -> Char#
chr#
        ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$
            Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
              ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00001111) Int
12
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111) Int
6
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word8 -> Word
word8ToWord Word8
w3 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
              )
        )
    )

codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes Word8
w1 Word8
w2 =
  Char# -> Char
C#
    ( Int# -> Char#
chr#
        ( Int -> Int#
unI (Int -> Int#) -> Int -> Int#
forall a b. (a -> b) -> a -> b
$
            forall a b. (Integral a, Num b) => a -> b
fromIntegral @Word @Int
              ( Word -> Int -> Word
forall a. Bits a => a -> Int -> a
unsafeShiftL (Word8 -> Word
word8ToWord Word8
w1 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00011111) Int
6
                  Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word8 -> Word
word8ToWord Word8
w2 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0b00111111)
              )
        )
    )

oneByteChar :: Word8 -> Bool
oneByteChar :: Word8 -> Bool
oneByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b10000000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0

twoByteChar :: Word8 -> Bool
twoByteChar :: Word8 -> Bool
twoByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11100000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11000000

threeByteChar :: Word8 -> Bool
threeByteChar :: Word8 -> Bool
threeByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11110000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11100000

fourByteChar :: Word8 -> Bool
fourByteChar :: Word8 -> Bool
fourByteChar !Word8
w = Word8
w Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11111000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11110000

followingByte :: Word8 -> Bool
followingByte :: Word8 -> Bool
followingByte !Word8
w = Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
xor Word8
w Word8
0b01000000 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0b11000000 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0b11000000

word8ToWord :: Word8 -> Word
word8ToWord :: Word8 -> Word
word8ToWord = Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral

unI :: Int -> Int#
unI :: Int -> Int#
unI (I# Int#
w) = Int#
w

{- | Consume input that matches the argument. Fails if the
input does not match.
-}
shortText :: e -> ShortText -> Parser e s ()
shortText :: forall e s. e -> ShortText -> Parser e s ()
shortText e
e !ShortText
t =
  e -> ByteArray -> Parser e s ()
forall e s. e -> ByteArray -> Parser e s ()
Parser.byteArray
    e
e
    (ShortByteString -> ByteArray
shortByteStringToByteArray (ShortText -> ShortByteString
TS.toShortByteString ShortText
t))

shortByteStringToByteArray ::
  BSS.ShortByteString ->
  PM.ByteArray
shortByteStringToByteArray :: ShortByteString -> ByteArray
shortByteStringToByteArray (BSS.SBS ByteArray#
x) = ByteArray# -> ByteArray
PM.ByteArray ByteArray#
x