module Web.Willow.Common.Encoding.Utf8
( decoder
, encoder
, byteOrderMark
) where
import qualified Control.Applicative as A
import qualified Data.Bifunctor as F.B
import qualified Data.Bits as B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Word as W
import Data.Functor ( ($>) )
import Data.Bits ( (.&.), (.|.) )
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
byteOrderMark
:: (A.Alternative gather, Monad gather)
=> ParserT BS.ByteString gather Encoding
byteOrderMark :: ParserT ByteString gather Encoding
byteOrderMark = ByteString -> ParserT ByteString gather ByteString
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq stream) =>
stream -> trans stream
chunk ([Word8] -> ByteString
BS.pack [Word8
0xEF, Word8
0xBB, Word8
0xBF]) ParserT ByteString gather ByteString
-> Encoding -> ParserT ByteString gather Encoding
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Encoding
Utf8
decoder :: TextBuilder
decoder :: TextBuilder
decoder = do
Word8
byte <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
Either ShortByteString Word
c <- Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
decoder' (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte
case Either ShortByteString Word
c of
Left ShortByteString
err -> [Word8] -> TextBuilder
forall state. [Word8] -> StateTextBuilder state
decoderFailure ([Word8] -> TextBuilder)
-> ([Word8] -> [Word8]) -> [Word8] -> TextBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8
byte Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
:) ([Word8] -> TextBuilder) -> [Word8] -> TextBuilder
forall a b. (a -> b) -> a -> b
$ ShortByteString -> [Word8]
BS.SH.unpack ShortByteString
err
Right Word
code -> [Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
byte] (Char -> TextBuilder) -> (Int -> Char) -> Int -> TextBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> TextBuilder) -> Int -> TextBuilder
forall a b. (a -> b) -> a -> b
$ Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
code
where decoder' :: Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
decoder' Word
b
| Word
b Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0x7F = Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word -> Either ShortByteString Word
forall a b. b -> Either a b
Right Word
b
| Word
b Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0xC1 = Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ ShortByteString -> Either ShortByteString Word
forall a b. a -> Either a b
Left ShortByteString
BS.SH.empty
| Word
b Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0xDF = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
1 Word8
defL Word8
defH (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x1F
| Word
b Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0xE0 = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
2 Word8
0xA0 Word8
defH (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0xF
| Word
b Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0xED = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
2 Word8
defL Word8
0x9F (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0xF
| Word
b Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0xEF = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
2 Word8
defL Word8
defH (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0xF
| Word
b Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0xF0 = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
3 Word8
0x90 Word8
defH (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x7
| Word
b Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0xF3 = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
3 Word8
defL Word8
defH (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x7
| Word
b Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0xF4 = StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall c.
StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError (StateT (Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
3 Word8
defL Word8
0x8F (Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ Word
b Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x7
| Bool
otherwise = Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word))
-> Either ShortByteString Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString Word)
forall a b. (a -> b) -> a -> b
$ ShortByteString -> Either ShortByteString Word
forall a b. a -> Either a b
Left ShortByteString
BS.SH.empty
defL :: Word8
defL = Word8
0x80
defH :: Word8
defH = Word8
0xBF
packError :: StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
packError = (Either [Word8] c -> Either ShortByteString c)
-> StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either [Word8] c -> Either ShortByteString c)
-> StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c))
-> (Either [Word8] c -> Either ShortByteString c)
-> StateT (Confidence, ()) (Parser ByteString) (Either [Word8] c)
-> StateT
(Confidence, ()) (Parser ByteString) (Either ShortByteString c)
forall a b. (a -> b) -> a -> b
$ ([Word8] -> ShortByteString)
-> Either [Word8] c -> Either ShortByteString c
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first [Word8] -> ShortByteString
BS.SH.pack
decodeChar
:: Word
-> W.Word8
-> W.Word8
-> Word
-> Decoder (Either [W.Word8] Word)
decodeChar :: Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar Word
0 Word8
_ Word8
_ Word
code = Either [Word8] Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either [Word8] Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word))
-> Either [Word8] Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
forall a b. (a -> b) -> a -> b
$ Word -> Either [Word8] Word
forall a b. b -> Either a b
Right Word
code
decodeChar Word
len Word8
low Word8
high Word
code = do
Word8
byte <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
if Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
low Word8
high Word8
byte
then ([Word8] -> [Word8]) -> Either [Word8] Word -> Either [Word8] Word
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first (Word8
byte Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
:) (Either [Word8] Word -> Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
Word
-> Word8
-> Word8
-> Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
decodeChar (Word
len Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
1) Word8
0x80 Word8
0xBF (Word -> Int -> Word
forall a. Bits a => a -> Int -> a
B.shiftL Word
code Int
6 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
else Word8 -> StateT (Confidence, ()) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
byte StateT (Confidence, ()) (Parser ByteString) ()
-> Either [Word8] Word
-> StateT
(Confidence, ()) (Parser ByteString) (Either [Word8] Word)
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> [Word8] -> Either [Word8] Word
forall a b. a -> Either a b
Left []
encoder :: BinaryBuilder
encoder :: BinaryBuilder
encoder = do
Char
char <- StateT () (Parser Text) Char
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
(Int
len, Int
offset) <- Char -> StateT () (Parser Text) (Int, Int)
forall (f :: * -> *) a b.
(Num a, Num b, Alternative f) =>
Char -> f (a, b)
countOffset Char
char
let first :: Word8
first = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> Int -> Word8
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Bits a => a -> Int -> a
B.shiftR (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
char) (Int
6 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
len) Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
offset
Either Char ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char ShortByteString -> BinaryBuilder)
-> ([Word8] -> Either Char ShortByteString)
-> [Word8]
-> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> Either Char ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> Either Char ShortByteString)
-> ([Word8] -> ShortByteString)
-> [Word8]
-> Either Char ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ShortByteString
BS.SH.pack ([Word8] -> BinaryBuilder) -> [Word8] -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ Word8
first Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: Int -> Int -> [Word8]
forall t a. (Integral t, Bits t, Num a) => t -> Int -> [a]
trail (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
char) Int
len
where countOffset :: Char -> f (a, b)
countOffset Char
char
| Char -> Bool
C.isAscii Char
char = (a, b) -> f (a, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a
0, b
0)
| Char
char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x07FF' = (a, b) -> f (a, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a
1, b
0xC0)
| Char
char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFFFF' = (a, b) -> f (a, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a
2, b
0xE0)
| Char
char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x10FFFF' = (a, b) -> f (a, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a
3, b
0xF0)
| Bool
otherwise = f (a, b)
forall (f :: * -> *) a. Alternative f => f a
A.empty
trail :: t -> Int -> [a]
trail t
_ Int
0 = []
trail t
code Int
count = t -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (t
0x80 t -> t -> t
forall a. Bits a => a -> a -> a
.|. (t -> Int -> t
forall a. Bits a => a -> Int -> a
B.shiftR t
code (Int
6 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Int
count Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)) t -> t -> t
forall a. Bits a => a -> a -> a
.&. t
0x3F)) a -> [a] -> [a]
forall a. a -> [a] -> [a]
:
t -> Int -> [a]
trail t
code (Int
count Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)