{-# LANGUAGE UnboxedTuples #-}

-- | Parsers and textual data (UTF-8, ASCII).

module FlatParse.Stateful.Text
  (
  -- * UTF-8
    char, string
  , anyChar, skipAnyChar
  , satisfy, skipSatisfy
  , fusedSatisfy, skipFusedSatisfy
  , takeLine
  , takeRestString

  -- * ASCII
  , anyAsciiChar, skipAnyAsciiChar
  , satisfyAscii, skipSatisfyAscii

  -- ** ASCII-encoded numbers
  , anyAsciiDecimalWord
  , anyAsciiDecimalInt
  , anyAsciiDecimalInteger
  , anyAsciiHexWord
  , anyAsciiHexInt

  -- * Debugging parsers
  , traceLine
  , traceRest
  ) where

import FlatParse.Stateful.Parser
import FlatParse.Stateful.Base ( withEnsure1, lookahead, eof, branch )
import FlatParse.Stateful.Bytes ( bytes )

import FlatParse.Common.GHCExts

import Language.Haskell.TH
import qualified FlatParse.Common.Numbers as Common
import qualified FlatParse.Common.Assorted as Common

-- | Parse any single Unicode character encoded using UTF-8 as a 'Char'.
anyChar :: ParserT st r e Char
anyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
  1# -> Fail# st
  _  -> case Common.derefChar8# buf of
    c1 -> case c1 `leChar#` '\x7F'# of
      1# -> OK# st (C# c1) (plusAddr# buf 1#) n
      _  -> case eqAddr# eob (plusAddr# buf 1#) of
        1# -> Fail# st
        _ -> case indexCharOffAddr# buf 1# of
          c2 -> case c1 `leChar#` '\xDF'# of
            1# ->
              let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
                          (ord# c2 -# 0x80#)
              in OK# st (C# (chr# resc)) (plusAddr# buf 2#) n
            _ -> case eqAddr# eob (plusAddr# buf 2#) of
              1# -> Fail# st
              _  -> case indexCharOffAddr# buf 2# of
                c3 -> case c1 `leChar#` '\xEF'# of
                  1# ->
                    let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
                               ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
                                (ord# c3 -# 0x80#)
                    in OK# st (C# (chr# resc)) (plusAddr# buf 3#) n
                  _ -> case eqAddr# eob (plusAddr# buf 3#) of
                    1# -> Fail# st
                    _  -> case indexCharOffAddr# buf 3# of
                      c4 ->
                        let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
                                   ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
                                   ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
                                    (ord# c4 -# 0x80#)
                        in OK# st (C# (chr# resc)) (plusAddr# buf 4#) n
{-# inline anyChar #-}

-- | Skip any single Unicode character encoded using UTF-8.
skipAnyChar :: ParserT st r e ()
skipAnyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
  1# -> Fail# st
  _  -> case Common.derefChar8# buf of
    c1 -> case c1 `leChar#` '\x7F'# of
      1# -> OK# st () (plusAddr# buf 1#) n
      _  ->
        let buf' =
              case c1 `leChar#` '\xDF'# of
                1# -> plusAddr# buf 2#
                _  -> case c1 `leChar#` '\xEF'# of
                    1# -> plusAddr# buf 3#
                    _ ->  plusAddr# buf 4#
        in case leAddr# buf' eob of
             1# -> OK#   st () buf' n
             _  -> Fail# st
{-# inline skipAnyChar #-}

withSatisfy
    :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret
withSatisfy f p = ParserT \fp !r eob s n st ->
    case runParserT# anyChar fp r eob s n st of
      -- This is OK# unfolded, to silence incomplete pattern warnings
      -- in GHC <= 8.8.4.
      (# st, (# (# c, s, n #) | | #) #) | f c -> runParserT# (p c) fp r eob s n st
      (# st, _ #)                             -> Fail# st
{-# inline withSatisfy #-}

-- | Parse a UTF-8 'Char' for which a predicate holds.
satisfy :: (Char -> Bool) -> ParserT st r e Char
satisfy f = withSatisfy f pure
{-# inline satisfy #-}

-- | Skip a UTF-8 `Char` for which a predicate holds.
skipSatisfy :: (Char -> Bool) -> ParserT st r e ()
skipSatisfy f = withSatisfy f (\_ -> pure ())
{-# inline skipSatisfy #-}

withSatisfyAscii
    :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret
withSatisfyAscii f p = withEnsure1 $ ParserT \fp !r eob s n st ->
    case Common.derefChar8# s of
      c1 | f (C# c1) -> runParserT# (p (C# c1)) fp r eob (plusAddr# s 1#) n st
         | otherwise -> Fail# st
{-#  inline withSatisfyAscii #-}

-- | Parse an ASCII 'Char' for which a predicate holds.
--
-- Assumption: the predicate must only return 'True' for ASCII-range characters.
-- Otherwise this function might read a 128-255 range byte, thereby breaking
-- UTF-8 decoding.
satisfyAscii :: (Char -> Bool) -> ParserT st r e Char
satisfyAscii f = withSatisfyAscii f pure
{-#  inline satisfyAscii #-}

-- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate
--   must only return `True` for ASCII-range characters.
skipSatisfyAscii :: (Char -> Bool) -> ParserT st r e ()
skipSatisfyAscii f = withSatisfyAscii f (\_ -> pure ())
{-#  inline skipSatisfyAscii #-}

-- | This is a variant of `satisfy` which allows more optimization. We can pick four testing
--   functions for the four cases for the possible number of bytes in the UTF-8 character. So in
--   @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with
--   @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups.
--
--   For example, if we want to accept any letter, the naive solution would be to use
--   `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We
--   can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the
--   `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of
--   all cases without accessing the character table.
fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e Char
fusedSatisfy f1 f2 f3 f4 = ParserT \fp !r eob buf n st ->
    case eqAddr# eob buf of
      1# -> Fail# st
      _  -> case Common.derefChar8# buf of
        c1 -> case c1 `leChar#` '\x7F'# of
          1# | f1 (C# c1) -> OK#   st (C# c1) (plusAddr# buf 1#) n
             | otherwise  -> Fail# st
          _  -> case eqAddr# eob (plusAddr# buf 1#) of
            1# -> Fail# st
            _ -> case indexCharOffAddr# buf 1# of
              c2 -> case c1 `leChar#` '\xDF'# of
                1# ->
                  let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
                                       (ord# c2 -# 0x80#)))
                  in case f2 resc of
                       True -> OK#   st resc (plusAddr# buf 2#) n
                       _    -> Fail# st
                _ -> case eqAddr# eob (plusAddr# buf 2#) of
                  1# -> Fail# st
                  _  -> case indexCharOffAddr# buf 2# of
                    c3 -> case c1 `leChar#` '\xEF'# of
                      1# ->
                        let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
                                             ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
                                             (ord# c3 -# 0x80#)))
                        in case f3 resc of
                             True -> OK#   st resc (plusAddr# buf 3#) n
                             _    -> Fail# st
                      _ -> case eqAddr# eob (plusAddr# buf 3#) of
                        1# -> Fail# st
                        _  -> case indexCharOffAddr# buf 3# of
                          c4 ->
                            let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
                                                 ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
                                                 ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
                                                  (ord# c4 -# 0x80#)))
                            in case f4 resc of
                                 True -> OK#   st resc (plusAddr# buf 4#) n
                                 _    -> Fail# st
{-# inline fusedSatisfy #-}

-- | Skipping variant of `fusedSatisfy`.
skipFusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e ()
skipFusedSatisfy f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4
{-# inline skipFusedSatisfy #-}

--------------------------------------------------------------------------------

-- | Parse a non-empty ASCII decimal digit sequence as a 'Word'.
--   Fails on overflow.
anyAsciiDecimalWord :: ParserT st r e Word
anyAsciiDecimalWord = ParserT \fp !r eob s n st ->
    case Common.anyAsciiDecimalWord# eob s of
      (# | (# w, s' #) #) -> OK#   st (W# w) s' n
      (# (##) | #)        -> Fail# st
{-# inline anyAsciiDecimalWord #-}

-- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
--   Fails on overflow.
anyAsciiDecimalInt :: ParserT st r e Int
anyAsciiDecimalInt = ParserT \fp !r eob s n st ->
    case Common.anyAsciiDecimalInt# eob s of
      (# | (# i, s' #) #) -> OK#   st (I# i) s' n
      (# (##) | #)        -> Fail# st
{-# inline anyAsciiDecimalInt #-}

-- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'.
anyAsciiDecimalInteger :: ParserT st r e Integer
anyAsciiDecimalInteger = ParserT \fp !r eob s n st ->
    case Common.anyAsciiDecimalInteger# fp eob s of
      (# | (# i, s' #) #) -> OK#   st i s' n
      (# (##) | #)        -> Fail# st
{-# inline anyAsciiDecimalInteger #-}

-- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
--   'Word'.
--   Fails on overflow.
anyAsciiHexWord :: ParserT st r e Word
anyAsciiHexWord = ParserT \fp !r eob s n st ->
    case Common.anyAsciiHexWord# eob s of
      (# | (# w, s' #) #) -> OK#   st (W# w) s' n
      (# (##) | #)        -> Fail# st
{-# inline anyAsciiHexWord #-}

-- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
--   positive 'Int'.
--   Fails on overflow.
anyAsciiHexInt :: ParserT st r e Int
anyAsciiHexInt = ParserT \fp !r eob s n st ->
    case Common.anyAsciiHexInt# eob s of
      (# | (# i, s' #) #) -> OK#   st (I# i) s' n
      (# (##) | #)        -> Fail# st
{-# inline anyAsciiHexInt #-}

--------------------------------------------------------------------------------

-- | Parse any single ASCII character (a single byte) as a 'Char'.
--
-- More efficient than 'anyChar' for ASCII-only input.
anyAsciiChar :: ParserT st r e Char
anyAsciiChar = withEnsure1 $ ParserT \fp !r eob buf n st ->
    case Common.derefChar8# buf of
      c1 -> case c1 `leChar#` '\x7F'# of
              1# -> OK#   st (C# c1) (plusAddr# buf 1#) n
              _  -> Fail# st
{-# inline anyAsciiChar #-}

-- | Skip any single ASCII character (a single byte).
--
-- More efficient than 'anyChar_' for ASCII-only input.
skipAnyAsciiChar :: ParserT st r e ()
skipAnyAsciiChar = () <$ anyAsciiChar
{-# inline skipAnyAsciiChar #-}

--------------------------------------------------------------------------------

-- | Parse a UTF-8 character literal. This is a template function, you can use it as
--   @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@.
char :: Char -> Q Exp
char c = string [c]

-- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@,
--   for example, and the splice has type @Parser e ()@.
string :: String -> Q Exp
string str = bytes (Common.strToBytes str)

--------------------------------------------------------------------------------

-- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding,
--   throws an error if the encoding is invalid.
takeLine :: ParserT st r e String
takeLine = branch eof (pure "") do
  c <- anyChar
  case c of
    '\n' -> pure ""
    _    -> (c:) <$> takeLine

-- | Parse the rest of the current line as a `String`, but restore the parsing state.
--   Assumes UTF-8 encoding. This can be used for debugging.
traceLine :: ParserT st r e String
traceLine = lookahead takeLine

-- | Take the rest of the input as a `String`. Assumes UTF-8 encoding.
takeRestString :: ParserT st r e String
takeRestString = branch eof (pure "") do
  c <- anyChar
  cs <- takeRestString
  pure (c:cs)

-- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding.
--   This can be used for debugging.
traceRest :: ParserT st r e String
traceRest = lookahead takeRestString