{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GADTSyntax #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE PolyKinds #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE UnboxedTuples #-}

{- | Parse input as UTF-8-encoded text. Parsers in this module will
fail if they encounter a byte above @0x7F@.
-}
module Data.Bytes.Parser.Utf8
  ( -- * Get Character
    any#
  , shortText
  ) where

import Prelude hiding (any, fail, length, takeWhile)

import Data.Bits (unsafeShiftL, xor, (.&.), (.|.))
import Data.Bytes.Parser.Internal (Parser (..))
import Data.Text.Short (ShortText)
import GHC.Exts (Char (C#), Char#, Int (I#), Int#, chr#, (+#), (-#), (>#))
import GHC.Word (Word8 (W8#))

import qualified Data.ByteString.Short.Internal as BSS
import qualified Data.Bytes.Parser as Parser
import qualified Data.Primitive as PM
import qualified Data.Text.Short as TS
import qualified GHC.Exts as Exts

{- FOURMOLU_DISABLE -}
-- | Interpret the next one to four bytes as a UTF-8-encoded character.
-- Fails if the decoded codepoint is in the range U+D800 through U+DFFF.
any# :: e -> Parser e s Char#
{-# noinline any# #-}
any# e = Parser
  (\(# arr, off, len #) s0 -> case len ># 0# of
    1# ->
      let !w0 = Exts.indexWord8Array# arr off
       in if | oneByteChar (W8# w0) ->
                 (# s0, (# | (# chr# (Exts.word2Int# (
#if MIN_VERSION_base(4,16,0)
                 Exts.word8ToWord#
#endif
                 w0)), off +# 1#, len -# 1# #) #) #)
             | twoByteChar (W8# w0) ->
                 if | I# len > 1
                    , w1 <- Exts.indexWord8Array# arr (off +# 1#)
                    , followingByte (W8# w1)
                    , C# c <- codepointFromTwoBytes (W8# w0) (W8# w1)
                      -> (# s0, (# | (# c, off +# 2#, len -# 2# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | threeByteChar (W8# w0) ->
                 if | I# len > 2
                    , w1 <- Exts.indexWord8Array# arr (off +# 1# )
                    , w2 <- Exts.indexWord8Array# arr (off +# 2# )
                    , followingByte (W8# w1)
                    , !c@(C# c#) <- codepointFromThreeBytes (W8# w0) (W8# w1) (W8# w2)
                    , c < '\xD800' || c > '\xDFFF'
                      -> (# s0, (# | (# c#, off +# 3#, len -# 3# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | fourByteChar (W8# w0) ->
                 if | I# len > 3
                    , w1 <- Exts.indexWord8Array# arr (off +# 1# )
                    , w2 <- Exts.indexWord8Array# arr (off +# 2# )
                    , w3 <- Exts.indexWord8Array# arr (off +# 3# )
                    , followingByte (W8# w1)
                    , !(C# c#) <- codepointFromFourBytes (W8# w0) (W8# w1) (W8# w2) (W8# w3)
                      -> (# s0, (# | (# c#, off +# 4#, len -# 4# #) #) #)
                    | otherwise -> (# s0, (# e | #) #)
             | otherwise -> (# s0, (# e | #) #)
    _ -> (# s0, (# e | #) #)
  )
{- FOURMOLU_ENABLE -}

codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char
codepointFromFourBytes w1 w2 w3 w4 =
  C#
    ( chr#
        ( unI $
            fromIntegral
              ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 18
                  .|. unsafeShiftL (word8ToWord w2 .&. 0b00111111) 12
                  .|. unsafeShiftL (word8ToWord w3 .&. 0b00111111) 6
                  .|. (word8ToWord w4 .&. 0b00111111)
              )
        )
    )

codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char
codepointFromThreeBytes w1 w2 w3 =
  C#
    ( chr#
        ( unI $
            fromIntegral
              ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 12
                  .|. unsafeShiftL (word8ToWord w2 .&. 0b00111111) 6
                  .|. (word8ToWord w3 .&. 0b00111111)
              )
        )
    )

codepointFromTwoBytes :: Word8 -> Word8 -> Char
codepointFromTwoBytes w1 w2 =
  C#
    ( chr#
        ( unI $
            fromIntegral @Word @Int
              ( unsafeShiftL (word8ToWord w1 .&. 0b00011111) 6
                  .|. (word8ToWord w2 .&. 0b00111111)
              )
        )
    )

oneByteChar :: Word8 -> Bool
oneByteChar !w = w .&. 0b10000000 == 0

twoByteChar :: Word8 -> Bool
twoByteChar !w = w .&. 0b11100000 == 0b11000000

threeByteChar :: Word8 -> Bool
threeByteChar !w = w .&. 0b11110000 == 0b11100000

fourByteChar :: Word8 -> Bool
fourByteChar !w = w .&. 0b11111000 == 0b11110000

followingByte :: Word8 -> Bool
followingByte !w = xor w 0b01000000 .&. 0b11000000 == 0b11000000

word8ToWord :: Word8 -> Word
word8ToWord = fromIntegral

unI :: Int -> Int#
unI (I# w) = w

{- | Consume input that matches the argument. Fails if the
input does not match.
-}
shortText :: e -> ShortText -> Parser e s ()
shortText e !t =
  Parser.byteArray
    e
    (shortByteStringToByteArray (TS.toShortByteString t))

shortByteStringToByteArray ::
  BSS.ShortByteString ->
  PM.ByteArray
shortByteStringToByteArray (BSS.SBS x) = PM.ByteArray x