module Language.Lexer.Tlex.Plugin.Encoding.UTF8 (
charSetPUtf8,
) where
import Language.Lexer.Tlex.Prelude
import qualified Data.CharSet as CharSet
import qualified Data.EnumMap.Strict as EnumMap
import qualified Data.EnumSet as EnumSet
import qualified Data.IntSet as IntSet
import qualified Language.Lexer.Tlex.Data.NonEmptyEnumStringSet as NonEmptyEnumStringSet
import qualified Language.Lexer.Tlex.Plugin.Encoding.CharSetP as CharSetP
import qualified Language.Lexer.Tlex.Syntax as Tlex
charSetPUtf8 :: CharSetP.CharSetEncoder m => CharSetP.CharSetP m
charSetPUtf8 :: CharSetP m
charSetPUtf8 = CharSetP :: forall (m :: * -> *). (CharSet -> m Pattern) -> CharSetP m
CharSetP.CharSetP
{ $sel:charSetEncodingP:CharSetP :: CharSet -> m Pattern
CharSetP.charSetEncodingP = \case
CharSet.CharSet Bool
True ByteSet
_ IntSet
is -> IntSet -> m Pattern
forall (m :: * -> *). CharSetEncoder m => IntSet -> m Pattern
goStraight IntSet
is
CharSet.CharSet Bool
False ByteSet
_ IntSet
is -> IntSet -> m Pattern
forall (m :: * -> *). CharSetEncoder m => IntSet -> m Pattern
goComplement IntSet
is
}
where
goStraight :: IntSet -> m Pattern
goStraight IntSet
is = do
NonEmptyEnumStringSet Word8
bsSet <- IntSet -> m (NonEmptyEnumStringSet Word8)
forall (m :: * -> *).
CharSetEncoder m =>
IntSet -> m (NonEmptyEnumStringSet Word8)
charSetToByteStringSetUtf8 IntSet
is
Pattern -> m Pattern
forall (f :: * -> *) a. Applicative f => a -> f a
pure do NonEmptyEnumStringSet Word8 -> Pattern
forall e. Enum e => NonEmptyEnumStringSet e -> Pattern e
straightP NonEmptyEnumStringSet Word8
bsSet
straightP :: NonEmptyEnumStringSet e -> Pattern e
straightP NonEmptyEnumStringSet e
s =
let singleByteP :: Pattern e
singleByteP = EnumSet e -> Pattern e
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP do
NonEmptyEnumStringSet e -> EnumSet e
forall a. NonEmptyEnumStringSet a -> EnumSet a
NonEmptyEnumStringSet.singleEnums NonEmptyEnumStringSet e
s
in [Pattern e] -> Pattern e
forall e. Enum e => [Pattern e] -> Pattern e
Tlex.orP do
Pattern e
singleBytePPattern e -> [Pattern e] -> [Pattern e]
forall a. a -> [a] -> [a]
:
[ [e] -> Pattern e
forall e. Enum e => [e] -> Pattern e
Tlex.enumsP [e
c] Pattern e -> Pattern e -> Pattern e
forall a. Semigroup a => a -> a -> a
<> NonEmptyEnumStringSet e -> Pattern e
straightP NonEmptyEnumStringSet e
s'
| (e
c, NonEmptyEnumStringSet e
s') <- EnumMap e (NonEmptyEnumStringSet e)
-> [(e, NonEmptyEnumStringSet e)]
forall k a. Enum k => EnumMap k a -> [(k, a)]
EnumMap.assocs do
NonEmptyEnumStringSet e -> EnumMap e (NonEmptyEnumStringSet e)
forall a.
NonEmptyEnumStringSet a -> EnumMap a (NonEmptyEnumStringSet a)
NonEmptyEnumStringSet.enumStrings NonEmptyEnumStringSet e
s
]
goComplement :: IntSet -> m Pattern
goComplement IntSet
is = do
NonEmptyEnumStringSet Word8
bsSet <- IntSet -> m (NonEmptyEnumStringSet Word8)
forall (m :: * -> *).
CharSetEncoder m =>
IntSet -> m (NonEmptyEnumStringSet Word8)
charSetToByteStringSetUtf8 IntSet
is
Pattern -> m Pattern
forall (f :: * -> *) a. Applicative f => a -> f a
pure do NonEmptyEnumStringSet Word8 -> Pattern
complementPFromEnumStrings NonEmptyEnumStringSet Word8
bsSet
charSetToByteStringSetUtf8 :: CharSetP.CharSetEncoder m
=> IntSet.IntSet -> m (NonEmptyEnumStringSet.NonEmptyEnumStringSet Word8)
charSetToByteStringSetUtf8 :: IntSet -> m (NonEmptyEnumStringSet Word8)
charSetToByteStringSetUtf8 IntSet
is = (NonEmptyEnumStringSet Word8
-> Int -> m (NonEmptyEnumStringSet Word8))
-> NonEmptyEnumStringSet Word8
-> [Int]
-> m (NonEmptyEnumStringSet Word8)
forall (t :: * -> *) (m :: * -> *) b a.
(Foldable t, Monad m) =>
(b -> a -> m b) -> b -> t a -> m b
foldM
do \NonEmptyEnumStringSet Word8
s Int
c -> NonEmptyEnumStringSet Word8
-> Int -> m (NonEmptyEnumStringSet Word8)
forall (f :: * -> *).
CharSetEncoder f =>
NonEmptyEnumStringSet Word8
-> Int -> f (NonEmptyEnumStringSet Word8)
foldStep NonEmptyEnumStringSet Word8
s Int
c
do NonEmptyEnumStringSet Word8
forall a. Enum a => NonEmptyEnumStringSet a
NonEmptyEnumStringSet.empty
do IntSet -> [Int]
IntSet.toAscList IntSet
is
where
foldStep :: NonEmptyEnumStringSet Word8
-> Int -> f (NonEmptyEnumStringSet Word8)
foldStep NonEmptyEnumStringSet Word8
s Int
c = if
| Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7F -> NonEmptyEnumStringSet Word8 -> f (NonEmptyEnumStringSet Word8)
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
Word8 -> NonEmptyEnumStringSet Word8 -> NonEmptyEnumStringSet Word8
forall a.
Enum a =>
a -> NonEmptyEnumStringSet a -> NonEmptyEnumStringSet a
NonEmptyEnumStringSet.insertSingleByte
do Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
c
do NonEmptyEnumStringSet Word8
s
| Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7FF ->
let (Word8
c', [Word8]
l) = Int -> Int -> (Word8, [Word8])
stringTails Int
c Int
1
in NonEmptyEnumStringSet Word8 -> f (NonEmptyEnumStringSet Word8)
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
NonEmpty Word8
-> NonEmptyEnumStringSet Word8 -> NonEmptyEnumStringSet Word8
forall a.
Enum a =>
NonEmpty a -> NonEmptyEnumStringSet a -> NonEmptyEnumStringSet a
NonEmptyEnumStringSet.insert
do (Word8
0xC0 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
c') Word8 -> [Word8] -> NonEmpty Word8
forall a. a -> [a] -> NonEmpty a
:| [Word8]
l
do NonEmptyEnumStringSet Word8
s
| Int
0xD800 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
c Bool -> Bool -> Bool
&& Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xDFFF -> do
EncodeWarning -> f ()
forall (m :: * -> *). CharSetEncoder m => EncodeWarning -> m ()
CharSetP.reportEncodeWarning
do Char -> EncodeWarning
CharSetP.NotSupportedChar do Int -> Char
forall a. Enum a => Int -> a
toEnum Int
c
NonEmptyEnumStringSet Word8 -> f (NonEmptyEnumStringSet Word8)
forall (f :: * -> *) a. Applicative f => a -> f a
pure NonEmptyEnumStringSet Word8
s
| Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xFFFF ->
let (Word8
c', [Word8]
l) = Int -> Int -> (Word8, [Word8])
stringTails Int
c Int
2
in NonEmptyEnumStringSet Word8 -> f (NonEmptyEnumStringSet Word8)
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
NonEmpty Word8
-> NonEmptyEnumStringSet Word8 -> NonEmptyEnumStringSet Word8
forall a.
Enum a =>
NonEmpty a -> NonEmptyEnumStringSet a -> NonEmptyEnumStringSet a
NonEmptyEnumStringSet.insert
do (Word8
0xE0 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
c') Word8 -> [Word8] -> NonEmpty Word8
forall a. a -> [a] -> NonEmpty a
:| [Word8]
l
do NonEmptyEnumStringSet Word8
s
| Bool
otherwise ->
let (Word8
c', [Word8]
l) = Int -> Int -> (Word8, [Word8])
stringTails Int
c Int
3
in NonEmptyEnumStringSet Word8 -> f (NonEmptyEnumStringSet Word8)
forall (f :: * -> *) a. Applicative f => a -> f a
pure do
NonEmpty Word8
-> NonEmptyEnumStringSet Word8 -> NonEmptyEnumStringSet Word8
forall a.
Enum a =>
NonEmpty a -> NonEmptyEnumStringSet a -> NonEmptyEnumStringSet a
NonEmptyEnumStringSet.insert
do (Word8
0xF0 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
c') Word8 -> [Word8] -> NonEmpty Word8
forall a. a -> [a] -> NonEmpty a
:| [Word8]
l
do NonEmptyEnumStringSet Word8
s
stringTails :: Int -> Int -> (Word8, [Word8])
stringTails :: Int -> Int -> (Word8, [Word8])
stringTails Int
c Int
n = Int -> [Word8] -> Int -> (Word8, [Word8])
forall t t a a.
(Integral t, Num t, Num a, Num a, Eq t) =>
t -> [a] -> t -> (a, [a])
stringTails' Int
c [] Int
n
stringTails' :: t -> [a] -> t -> (a, [a])
stringTails' t
c [a]
l = \case
t
0 -> (t -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral t
c, [a]
l)
t
n ->
let (t
c', t
x) = t -> t -> (t, t)
forall a. Integral a => a -> a -> (a, a)
quotRem t
c t
0x40
x' :: a
x' = t -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral do t
0x80 t -> t -> t
forall a. Num a => a -> a -> a
+ t
x
in t -> [a] -> t -> (a, [a])
stringTails' t
c'
do a
x' a -> [a] -> [a]
forall a. a -> [a] -> [a]
: [a]
l
do t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1
complementPFromEnumStrings
:: NonEmptyEnumStringSet.NonEmptyEnumStringSet Word8 -> Tlex.Pattern Word8
NonEmptyEnumStringSet Word8
ess0 = [Pattern] -> Pattern
forall e. Enum e => [Pattern e] -> Pattern e
Tlex.orP
[ [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr1es] []
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr2es, EnumSet Word8
seqes] [Pattern
seqesP]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr3p1o1es, EnumSet Word8
pr3p1o2es, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p1o1es
, EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p1o2es
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr3p2es, EnumSet Word8
seqes, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p2es
, Pattern
seqesP
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr3p3o1es, EnumSet Word8
pr3p3o2es, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p3o1es
, EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p3o2es
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr3p4es, EnumSet Word8
seqes, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr3p4es
, Pattern
seqesP
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr4p1o1es, EnumSet Word8
pr4p1o2es, EnumSet Word8
seqes, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr4p1o1es
, EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr4p1o2es
, Pattern
seqesP
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr4p2es, EnumSet Word8
seqes, EnumSet Word8
seqes, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr4p1o1es
, Pattern
seqesP
, Pattern
seqesP
, Pattern
seqesP
]
, [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8
pr4p3o1es, EnumSet Word8
pr4p3o2es, EnumSet Word8
seqes, EnumSet Word8
seqes]
[ EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr4p3o1es
, EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
pr4p3o2es
, Pattern
seqesP
, Pattern
seqesP
]
]
where
seqes :: EnumSet Word8
seqes = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0x80..Word8
0xBF]
seqesP :: Pattern
seqesP = EnumSet Word8 -> Pattern
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet Word8
seqes
pr1es :: EnumSet Word8
pr1es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0x00..Word8
0x7F]
pr2es :: EnumSet Word8
pr2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xC2..Word8
0xDF]
pr3p1o1es :: EnumSet Word8
pr3p1o1es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xE0]
pr3p1o2es :: EnumSet Word8
pr3p1o2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xA0..Word8
0xBF]
pr3p2es :: EnumSet Word8
pr3p2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xE1..Word8
0xEC]
pr3p3o1es :: EnumSet Word8
pr3p3o1es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xED]
pr3p3o2es :: EnumSet Word8
pr3p3o2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0x80..Word8
0x9F]
pr3p4es :: EnumSet Word8
pr3p4es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xEE..Word8
0xEF]
pr4p1o1es :: EnumSet Word8
pr4p1o1es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xF0]
pr4p1o2es :: EnumSet Word8
pr4p1o2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0x90..Word8
0xBF]
pr4p2es :: EnumSet Word8
pr4p2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xF1..Word8
0xF3]
pr4p3o1es :: EnumSet Word8
pr4p3o1es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0xF4]
pr4p3o2es :: EnumSet Word8
pr4p3o2es = [Word8] -> EnumSet Word8
forall k. Enum k => [k] -> EnumSet k
EnumSet.fromList @Word8 [Word8
0x80..Word8
0x8F]
go :: [EnumSet Word8] -> [Pattern] -> Pattern
go [EnumSet Word8]
bess [Pattern]
restPs = [EnumSet Word8]
-> [Pattern] -> NonEmptyEnumStringSet Word8 -> Pattern
forall e.
Enum e =>
[EnumSet e] -> [Pattern e] -> NonEmptyEnumStringSet e -> Pattern e
go' [EnumSet Word8]
bess [Pattern]
restPs NonEmptyEnumStringSet Word8
ess0
go' :: [EnumSet e] -> [Pattern e] -> NonEmptyEnumStringSet e -> Pattern e
go' [EnumSet e]
bess [Pattern e]
restPs NonEmptyEnumStringSet e
ess = case [EnumSet e]
bess of
[] -> Pattern e
forall a. Monoid a => a
mempty
[EnumSet e
bes] -> EnumSet e -> Pattern e
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP
do EnumSet e
bes EnumSet e -> EnumSet e -> EnumSet e
forall k. EnumSet k -> EnumSet k -> EnumSet k
`EnumSet.difference` NonEmptyEnumStringSet e -> EnumSet e
forall a. NonEmptyEnumStringSet a -> EnumSet a
NonEmptyEnumStringSet.singleEnums NonEmptyEnumStringSet e
ess
EnumSet e
bes:[EnumSet e]
bess2 ->
let mess :: EnumMap e (NonEmptyEnumStringSet e)
mess = NonEmptyEnumStringSet e -> EnumMap e (NonEmptyEnumStringSet e)
forall a.
NonEmptyEnumStringSet a -> EnumMap a (NonEmptyEnumStringSet a)
NonEmptyEnumStringSet.enumStrings NonEmptyEnumStringSet e
ess
(EnumSet e
nes, EnumSet e
ces) = (e -> Bool) -> EnumSet e -> (EnumSet e, EnumSet e)
forall k.
Enum k =>
(k -> Bool) -> EnumSet k -> (EnumSet k, EnumSet k)
EnumSet.partition
do \e
be -> e -> EnumMap e (NonEmptyEnumStringSet e) -> Bool
forall k a. Enum k => k -> EnumMap k a -> Bool
EnumMap.member e
be EnumMap e (NonEmptyEnumStringSet e)
mess
EnumSet e
bes
cesP :: Pattern e
cesP = EnumSet e -> Pattern e
forall e. Enum e => EnumSet e -> Pattern e
Tlex.straightEnumSetP EnumSet e
ces Pattern e -> Pattern e -> Pattern e
forall a. Semigroup a => a -> a -> a
<> [Pattern e] -> Pattern e
forall a. Monoid a => [a] -> a
mconcat [Pattern e]
restPs
in [Pattern e] -> Pattern e
forall e. Enum e => [Pattern e] -> Pattern e
Tlex.orP do
Pattern e
cesPPattern e -> [Pattern e] -> [Pattern e]
forall a. a -> [a] -> [a]
:
[ [EnumSet e] -> [Pattern e] -> NonEmptyEnumStringSet e -> Pattern e
go' [EnumSet e]
bess2 [Pattern e]
nrestPs NonEmptyEnumStringSet e
ness
| e
ne <- EnumSet e -> [e]
forall k. Enum k => EnumSet k -> [k]
EnumSet.toList EnumSet e
nes
, let ness :: NonEmptyEnumStringSet e
ness = case e
-> EnumMap e (NonEmptyEnumStringSet e)
-> Maybe (NonEmptyEnumStringSet e)
forall k a. Enum k => k -> EnumMap k a -> Maybe a
EnumMap.lookup e
ne EnumMap e (NonEmptyEnumStringSet e)
mess of
Just NonEmptyEnumStringSet e
x -> NonEmptyEnumStringSet e
x
Maybe (NonEmptyEnumStringSet e)
Nothing -> [Char] -> NonEmptyEnumStringSet e
forall a. HasCallStack => [Char] -> a
error [Char]
"unreachable"
, let nrestPs :: [Pattern e]
nrestPs = case [Pattern e]
restPs of
[] -> []
Pattern e
_:[Pattern e]
xs -> [Pattern e]
xs
]