-- |
-- Module      : Streamly.Internal.Unicode.Parser
-- Copyright   : (c) 2021 Composewell Technologies
-- License     : BSD-3-Clause
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
-- Portability : GHC
--
-- To parse a text input, use the decode routines from
-- "Streamly.Unicode.Stream" module to convert an input byte stream to a
-- Unicode Char stream and then use these parsers on the Char stream.

module Streamly.Internal.Unicode.Parser
    (
    -- * Generic
      char
    , charIgnoreCase

    -- * Sequences
    , string
    , stringIgnoreCase
    , dropSpace
    , dropSpace1

    -- * Classes
    , alpha
    , alphaNum
    , letter
    , ascii
    , asciiLower
    , asciiUpper
    , latin1
    , lower
    , upper
    , mark
    , printable
    , punctuation
    , separator
    , space
    , symbol

    -- digits
    , digit
    , octDigit
    , hexDigit
    , numeric

    -- * Numeric
    , signed
    , double
    , decimal
    , hexadecimal
    )
where

import Control.Applicative (Alternative(..))
import Data.Bits (Bits, (.|.), shiftL)
import Data.Char (ord)
import Streamly.Internal.Data.Parser (Parser)

import qualified Data.Char as Char
import qualified Streamly.Data.Fold as Fold
import qualified Streamly.Internal.Data.Parser as Parser
    (
      lmap
    , satisfy
    , listEq
    , takeWhile1
    , dropWhile
    )

--------------------------------------------------------------------------------
-- Character classification
--------------------------------------------------------------------------------

-- XXX It may be possible to implement faster predicates for ASCII byte stream.
-- We can measure if there is a signficant difference and if so we can add such
-- predicates to Streamly.Unicode.Parser.Latin1.
--
#define CHAR_PARSER_SIG(NAME)         NAME :: Monad m => Parser Char m Char
-- XXX Need to use the predicates from Unicode.Char module/unicode-data package
#define CHAR_PARSER(NAME, PREDICATE)  NAME = Parser.satisfy Char.PREDICATE
#define CHAR_PARSER_DOC(PREDICATE) -- | Match any character that satisfies 'Char.PREDICATE'
#define CHAR_PARSER_INLINE(NAME)      {-# INLINE NAME #-}

CHAR_PARSER_DOC(isSpace)
CHAR_PARSER_INLINE(space)
CHAR_PARSER_SIG(space)
CHAR_PARSER(space,isSpace)

CHAR_PARSER_DOC(isLower)
CHAR_PARSER_INLINE(lower)
CHAR_PARSER_SIG(lower)
CHAR_PARSER(lower,isLower)

CHAR_PARSER_DOC(isUpper)
CHAR_PARSER_INLINE(upper)
CHAR_PARSER_SIG(upper)
CHAR_PARSER(upper,isUpper)

CHAR_PARSER_DOC(isAlpha)
CHAR_PARSER_INLINE(alpha)
CHAR_PARSER_SIG(alpha)
CHAR_PARSER(alpha,isAlpha)

CHAR_PARSER_DOC(isAlphaNum)
CHAR_PARSER_INLINE(alphaNum)
CHAR_PARSER_SIG(alphaNum)
CHAR_PARSER(alphaNum,isAlphaNum)

CHAR_PARSER_DOC(isPrint)
CHAR_PARSER_INLINE(printable)
CHAR_PARSER_SIG(printable)
CHAR_PARSER(printable,isPrint)

CHAR_PARSER_DOC(isDigit)
CHAR_PARSER_INLINE(digit)
CHAR_PARSER_SIG(digit)
CHAR_PARSER(digit,isDigit)

CHAR_PARSER_DOC(isOctDigit)
CHAR_PARSER_INLINE(octDigit)
CHAR_PARSER_SIG(octDigit)
CHAR_PARSER(octDigit,isOctDigit)

CHAR_PARSER_DOC(isHexDigit)
CHAR_PARSER_INLINE(hexDigit)
CHAR_PARSER_SIG(hexDigit)
CHAR_PARSER(hexDigit,isHexDigit)

CHAR_PARSER_DOC(isLetter)
CHAR_PARSER_INLINE(letter)
CHAR_PARSER_SIG(letter)
CHAR_PARSER(letter,isLetter)

CHAR_PARSER_DOC(isMark)
CHAR_PARSER_INLINE(mark)
CHAR_PARSER_SIG(mark)
CHAR_PARSER(mark,isMark)

CHAR_PARSER_DOC(isNumber)
CHAR_PARSER_INLINE(numeric)
CHAR_PARSER_SIG(numeric)
CHAR_PARSER(numeric,isNumber)

CHAR_PARSER_DOC(isPunctuation)
CHAR_PARSER_INLINE(punctuation)
CHAR_PARSER_SIG(punctuation)
punctuation :: Parser Char m Char
CHAR_PARSER(punctuation,isPunctuation)

CHAR_PARSER_DOC(isSymbol)
CHAR_PARSER_INLINE(symbol)
CHAR_PARSER_SIG(symbol)
CHAR_PARSER(symbol,isSymbol)

CHAR_PARSER_DOC(isSeparator)
CHAR_PARSER_INLINE(separator)
CHAR_PARSER_SIG(separator)
CHAR_PARSER(separator,isSeparator)

CHAR_PARSER_DOC(isAscii)
CHAR_PARSER_INLINE(ascii)
CHAR_PARSER_SIG(ascii)
CHAR_PARSER(ascii,isAscii)

CHAR_PARSER_DOC(isLatin1)
CHAR_PARSER_INLINE(latin1)
CHAR_PARSER_SIG(latin1)
CHAR_PARSER(latin1,isLatin1)

CHAR_PARSER_DOC(isAsciiUpper)
CHAR_PARSER_INLINE(asciiUpper)
CHAR_PARSER_SIG(asciiUpper)
CHAR_PARSER(asciiUpper,isAsciiUpper)

CHAR_PARSER_DOC(isAsciiLower)
CHAR_PARSER_INLINE(asciiLower)
CHAR_PARSER_SIG(asciiLower)
CHAR_PARSER(asciiLower,isAsciiLower)

--------------------------------------------------------------------------------
-- Character parsers
--------------------------------------------------------------------------------

-- | Match a specific character.
{-# INLINE char #-}
char :: Monad m => Char -> Parser Char m Char
char :: Char -> Parser Char m Char
char Char
c = (Char -> Bool) -> Parser Char m Char
forall (m :: * -> *) a. Monad m => (a -> Bool) -> Parser a m a
Parser.satisfy (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c)

-- XXX Case conversion may lead to change in number of chars
-- | Match a specific character ignoring case.
{-# INLINE charIgnoreCase #-}
charIgnoreCase :: Monad m => Char -> Parser Char m Char
charIgnoreCase :: Char -> Parser Char m Char
charIgnoreCase Char
c = (Char -> Char) -> Parser Char m Char -> Parser Char m Char
forall a b (m :: * -> *) r.
(a -> b) -> Parser b m r -> Parser a m r
Parser.lmap Char -> Char
Char.toLower ((Char -> Bool) -> Parser Char m Char
forall (m :: * -> *) a. Monad m => (a -> Bool) -> Parser a m a
Parser.satisfy (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Char
Char.toLower Char
c))

--------------------------------------------------------------------------------
-- Character sequences
--------------------------------------------------------------------------------

-- | Match the input with the supplied string and return it if successful.
string :: Monad m => String -> Parser Char m String
string :: String -> Parser Char m String
string = String -> Parser Char m String
forall (m :: * -> *) a. (Monad m, Eq a) => [a] -> Parser a m [a]
Parser.listEq

-- XXX Not accurate unicode case conversion
-- | Match the input with the supplied string and return it if successful.
stringIgnoreCase :: Monad m => String -> Parser Char m String
stringIgnoreCase :: String -> Parser Char m String
stringIgnoreCase String
s =
    (Char -> Char) -> Parser Char m String -> Parser Char m String
forall a b (m :: * -> *) r.
(a -> b) -> Parser b m r -> Parser a m r
Parser.lmap Char -> Char
Char.toLower (String -> Parser Char m String
forall (m :: * -> *) a. (Monad m, Eq a) => [a] -> Parser a m [a]
Parser.listEq ((Char -> Char) -> String -> String
forall a b. (a -> b) -> [a] -> [b]
map Char -> Char
Char.toLower String
s))

-- | Drop /zero/ or more white space characters.
dropSpace :: Monad m => Parser Char m ()
dropSpace :: Parser Char m ()
dropSpace = (Char -> Bool) -> Parser Char m ()
forall (m :: * -> *) a. Monad m => (a -> Bool) -> Parser a m ()
Parser.dropWhile Char -> Bool
Char.isSpace

-- | Drop /one/ or more white space characters.
dropSpace1 :: Monad m => Parser Char m ()
dropSpace1 :: Parser Char m ()
dropSpace1 = (Char -> Bool) -> Fold m Char () -> Parser Char m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> Bool) -> Fold m a b -> Parser a m b
Parser.takeWhile1 Char -> Bool
Char.isSpace Fold m Char ()
forall (m :: * -> *) a. Monad m => Fold m a ()
Fold.drain

--------------------------------------------------------------------------------
-- Numeric parsers
--------------------------------------------------------------------------------

-- XXX It should fail if the number is larger than the size of the type.
--
-- | Parse and decode an unsigned integral decimal number.
{-# INLINE decimal #-}
decimal :: (Monad m, Integral a) => Parser Char m a
decimal :: Parser Char m a
decimal = (Char -> Bool) -> Fold m Char a -> Parser Char m a
forall (m :: * -> *) a b.
Monad m =>
(a -> Bool) -> Fold m a b -> Parser a m b
Parser.takeWhile1 Char -> Bool
Char.isDigit ((a -> Char -> a) -> a -> Fold m Char a
forall (m :: * -> *) b a.
Monad m =>
(b -> a -> b) -> b -> Fold m a b
Fold.foldl' a -> Char -> a
forall a. Num a => a -> Char -> a
step a
0)

    where

    step :: a -> Char -> a
step a
a Char
c = a
a a -> a -> a
forall a. Num a => a -> a -> a
* a
10 a -> a -> a
forall a. Num a => a -> a -> a
+ Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
48)

-- | Parse and decode an unsigned integral hexadecimal number.  The hex digits
-- @\'a\'@ through @\'f\'@ may be upper or lower case.
--
-- Note: This parser does not accept a leading @\"0x\"@ string.
{-# INLINE hexadecimal #-}
hexadecimal :: (Monad m, Integral a, Bits a) => Parser Char m a
hexadecimal :: Parser Char m a
hexadecimal = (Char -> Bool) -> Fold m Char a -> Parser Char m a
forall (m :: * -> *) a b.
Monad m =>
(a -> Bool) -> Fold m a b -> Parser a m b
Parser.takeWhile1 Char -> Bool
isHexDigit ((a -> Char -> a) -> a -> Fold m Char a
forall (m :: * -> *) b a.
Monad m =>
(b -> a -> b) -> b -> Fold m a b
Fold.foldl' a -> Char -> a
forall a. (Bits a, Num a) => a -> Char -> a
step a
0)

    where

    isHexDigit :: Char -> Bool
isHexDigit Char
c =
           (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'9')
        Bool -> Bool -> Bool
|| (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'a' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'f')
        Bool -> Bool -> Bool
|| (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'A' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'F')

    step :: a -> Char -> a
step a
a Char
c
        | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
48 Bool -> Bool -> Bool
&& Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
57 =
            (a
a a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftL` Int
4) a -> a -> a
forall a. Bits a => a -> a -> a
.|. Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
48)
        | Int
w Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
97 =
            (a
a a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftL` Int
4) a -> a -> a
forall a. Bits a => a -> a -> a
.|. Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
87)
        | Bool
otherwise =
            (a
a a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftL` Int
4) a -> a -> a
forall a. Bits a => a -> a -> a
.|. Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
55)

        where

        w :: Int
w = Char -> Int
ord Char
c

-- | Allow an optional leading @\'+\'@ or @\'-\'@ sign character before any
-- parser.
{-# INLINE signed #-}
signed :: (Num a, Monad m) => Parser Char m a -> Parser Char m a
signed :: Parser Char m a -> Parser Char m a
signed Parser Char m a
p = (a -> a
forall a. Num a => a -> a
negate (a -> a) -> Parser Char m a -> Parser Char m a
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Parser Char m Char
forall (m :: * -> *). Monad m => Char -> Parser Char m Char
char Char
'-' Parser Char m Char -> Parser Char m a -> Parser Char m a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Char m a
p)) Parser Char m a -> Parser Char m a -> Parser Char m a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Char m Char
forall (m :: * -> *). Monad m => Char -> Parser Char m Char
char Char
'+' Parser Char m Char -> Parser Char m a -> Parser Char m a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Char m a
p) Parser Char m a -> Parser Char m a -> Parser Char m a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Char m a
p

-- | Parse a 'Double'.
--
-- This parser accepts an optional leading sign character, followed by
-- at most one decimal digit.  The syntax is similar to that accepted by
-- the 'read' function, with the exception that a trailing @\'.\'@ is
-- consumed.
--
-- === Examples
--
-- Examples with behaviour identical to 'read', if you feed an empty
-- continuation to the first result:
--
-- > IS.parse double (IS.fromList "3")     == 3.0
-- > IS.parse double (IS.fromList "3.1")   == 3.1
-- > IS.parse double (IS.fromList "3e4")   == 30000.0
-- > IS.parse double (IS.fromList "3.1e4") == 31000.0
-- > IS.parse double (IS.fromList "3e")    == 30
--
-- Examples with behaviour identical to 'read':
--
-- > IS.parse (IS.fromList ".3")    == error "Parse failed"
-- > IS.parse (IS.fromList "e3")    == error "Parse failed"
--
-- Example of difference from 'read':
--
-- > IS.parse double (IS.fromList "3.foo") == 3.0
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
--
-- /Unimplemented/
double :: Parser Char m Double
double :: Parser Char m Double
double = Parser Char m Double
forall a. HasCallStack => a
undefined