{-|
Module:     Codec.Parser.UTF8
Copyright:  Jeremy List
License:    BSD-3
Maintainer: quick.dudley@gmail.com

'Phase's for decoding bytes to characters using UTF-8
-}
module Codec.Phaser.UTF8 (
  utf8_char,
  utf8_stream,
  utf8_encode
 ) where

import Data.Bits
import Data.Word
import Data.List
import Control.Monad
import Control.Applicative

import Codec.Phaser.Core

-- | Consume a UTF-8 character from a stream of bytes and return it. Fail on
-- invalid UTF-8.
utf8_char :: (Monoid p) => Phase p Word8 o Char
utf8_char :: forall p o. Monoid p => Phase p Word8 o Char
utf8_char = do
  Int
c1 <- (Word8 -> Int) -> Phase p Word8 o Word8 -> Phase p Word8 o Int
forall a b. (a -> b) -> Phase p Word8 o a -> Phase p Word8 o b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Phase p Word8 o Word8
forall p i o. Phase p i o i
get
  case () of
   ()
_ | Int
c1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x80 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 -> Char -> Phase p Word8 o Char
forall a. a -> Phase p Word8 o a
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Phase p Word8 o Char) -> Char -> Phase p Word8 o Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
forall a. Enum a => Int -> a
toEnum Int
c1
     | Int
c1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x40 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 -> String -> Phase p Word8 o Char
forall a. String -> Phase p Word8 o a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"UTF-8 codepoint missing initial byte"
     | Int -> Int
forall a. Bits a => a -> a
complement Int
c1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x38 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 ->
        String -> Phase p Word8 o Char
forall a. String -> Phase p Word8 o a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Invalid UTF-8 codepoint initial byte"
     | Bool
otherwise -> Int -> Int -> Phase p Word8 o Char
forall {a} {p} {b} {o}.
(Integral a, Monoid p, Enum b) =>
Int -> Int -> Phase p a o b
go Int
0x20 (Int
c1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
 where
  go :: Int -> Int -> Phase p a o b
go Int
z Int
a = do
    Int
c2 <- String
"Incomplete UTF-8 codepoint" String -> Phase p a o Int -> Phase p a o Int
forall {p} {i} {o} {a}. String -> Phase p i o a -> Phase p i o a
<?> do
      Int
c <- (a -> Int) -> Phase p a o a -> Phase p a o Int
forall a b. (a -> b) -> Phase p a o a -> Phase p a o b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Phase p a o a
forall p i o. Phase p i o i
get
      Bool -> Phase p a o ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Phase p a o ()) -> Bool -> Phase p a o ()
forall a b. (a -> b) -> a -> b
$ Int
c Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0xc0 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0x80
      Int -> Phase p a o Int
forall a. a -> Phase p a o a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
c
    if (Int
a Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
z) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
      then b -> Phase p a o b
forall a. a -> Phase p a o a
forall (m :: * -> *) a. Monad m => a -> m a
return (b -> Phase p a o b) -> b -> Phase p a o b
forall a b. (a -> b) -> a -> b
$ Int -> b
forall a. Enum a => Int -> a
toEnum (Int -> b) -> Int -> b
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftL Int
a Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
c2 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
      else Int -> Int -> Phase p a o b
go (Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftL Int
z Int
5) (Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftL (Int
a Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement Int
z) Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
c2 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F))

-- | Consume any number of UTF-8 characters and yield them.
utf8_stream :: (Monoid p) => Phase p Word8 Char ()
utf8_stream :: forall p. Monoid p => Phase p Word8 Char ()
utf8_stream = (Phase p Word8 Char Char
forall p o. Monoid p => Phase p Word8 o Char
utf8_char Phase p Word8 Char Char
-> (Char -> Phase p Word8 Char ()) -> Phase p Word8 Char ()
forall a b.
Phase p Word8 Char a
-> (a -> Phase p Word8 Char b) -> Phase p Word8 Char b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Char -> Phase p Word8 Char ()
forall o p i. o -> Phase p i o ()
yield Phase p Word8 Char ()
-> Phase p Word8 Char () -> Phase p Word8 Char ()
forall a b.
Phase p Word8 Char a
-> Phase p Word8 Char b -> Phase p Word8 Char b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Phase p Word8 Char ()
forall p. Monoid p => Phase p Word8 Char ()
utf8_stream) Phase p Word8 Char ()
-> Phase p Word8 Char () -> Phase p Word8 Char ()
forall a.
Phase p Word8 Char a
-> Phase p Word8 Char a -> Phase p Word8 Char a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Phase p Word8 Char ()
forall a. a -> Phase p Word8 Char a
forall (m :: * -> *) a. Monad m => a -> m a
return ()

-- | Consume any number of Characters and yield them as UTF-8 bytes
utf8_encode :: (Monoid p) => Phase p Char Word8 ()
utf8_encode :: forall p. Monoid p => Phase p Char Word8 ()
utf8_encode = (Char -> Int
forall a. Enum a => a -> Int
fromEnum (Char -> Int) -> Phase p Char Word8 Char -> Phase p Char Word8 Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Phase p Char Word8 Char
forall p i o. Phase p i o i
get) Phase p Char Word8 Int
-> (Int -> Phase p Char Word8 ()) -> Phase p Char Word8 ()
forall a b.
Phase p Char Word8 a
-> (a -> Phase p Char Word8 b) -> Phase p Char Word8 b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Int
c -> if Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x80
  then Word8 -> Phase p Char Word8 ()
forall o p i. o -> Phase p i o ()
yield (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
c) Phase p Char Word8 ()
-> Phase p Char Word8 () -> Phase p Char Word8 ()
forall a b.
Phase p Char Word8 a
-> Phase p Char Word8 b -> Phase p Char Word8 b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Phase p Char Word8 ()
nxt
  else Word8 -> Word8 -> Int -> [Word8] -> Phase p Char Word8 ()
forall {t}.
(Integral t, Bits t) =>
Word8 -> Word8 -> t -> [Word8] -> Phase p Char Word8 ()
go Word8
0xC0 Word8
0x20 Int
c []
 where
  nxt :: Phase p Char Word8 ()
nxt = (Phase p Char Word8 ()
forall p. Monoid p => Phase p Char Word8 ()
utf8_encode Phase p Char Word8 ()
-> Phase p Char Word8 () -> Phase p Char Word8 ()
forall a.
Phase p Char Word8 a
-> Phase p Char Word8 a -> Phase p Char Word8 a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Phase p Char Word8 ()
forall a. a -> Phase p Char Word8 a
forall (m :: * -> *) a. Monad m => a -> m a
return ())
  go :: Word8 -> Word8 -> t -> [Word8] -> Phase p Char Word8 ()
go Word8
pfb Word8
fzb t
c' [Word8]
o = let
    l :: t
l = t -> Int -> t
forall a. Bits a => a -> Int -> a
shiftR t
c' Int
6
    m :: Word8
m = t -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (t
c' t -> t -> t
forall a. Bits a => a -> a -> a
.&. t
0x3F)
    m' :: Word8
m' = Word8
m Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
0x80
    in if t
l t -> t -> Bool
forall a. Ord a => a -> a -> Bool
< Word8 -> t
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
fzb
      then (Word8 -> Phase p Char Word8 () -> Phase p Char Word8 ())
-> Phase p Char Word8 () -> [Word8] -> Phase p Char Word8 ()
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\Word8
b Phase p Char Word8 ()
r -> Word8 -> Phase p Char Word8 ()
forall o p i. o -> Phase p i o ()
yield Word8
b Phase p Char Word8 ()
-> Phase p Char Word8 () -> Phase p Char Word8 ()
forall a b.
Phase p Char Word8 a
-> Phase p Char Word8 b -> Phase p Char Word8 b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Phase p Char Word8 ()
r) Phase p Char Word8 ()
nxt ([Word8] -> Phase p Char Word8 ())
-> [Word8] -> Phase p Char Word8 ()
forall a b. (a -> b) -> a -> b
$
        (Word8
pfb Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. t -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral t
l) Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: Word8
m' Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
o
      else Word8 -> Word8 -> t -> [Word8] -> Phase p Char Word8 ()
go (Word8
pfb Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
fzb) (Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
shiftR Word8
fzb Int
1) t
l (Word8
m' Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: [Word8]
o)