{-|
Description:    Character translation functions to and from the Big5 encoding scheme.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.Big5
    ( decoder
    , encoder
    ) where


import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Tuple as U
import qualified Data.Word as W

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch


-- | __Encoding:__
--      @[Big5 decoder]
--      (https://encoding.spec.whatwg.org/#big5-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with the 'Big5' encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoder :: TextBuilder
decoder :: TextBuilder
decoder = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Word8
   (StateT (Confidence, ()) (Parser ByteString))
   (DecoderError String)]
-> Word8 -> TextBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Word8 -> Bool
isAsciiByte Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
toUnicode1
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x81 Word8
0xFE) Word8 -> TextBuilder
decoder'
    , (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
decoderFailure1
    ]

-- | __Encoding:__
--      @[Big5 decoder]
--      (https://encoding.spec.whatwg.org/#big5-decoder)@
--      step 3
-- 
-- Process a double-byte sequence according to the 'Big5' encoding scheme.
decoder' :: W.Word8 -> TextBuilder
decoder' :: Word8 -> TextBuilder
decoder' Word8
first = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Word8
   (StateT (Confidence, ()) (Parser ByteString))
   (DecoderError String)]
-> Word8 -> TextBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x40 Word8
0x7E) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word -> Word8 -> Word8 -> TextBuilder
decodeChar Word
0x40 Word8
first
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0xA1 Word8
0xFE) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word -> Word8 -> Word8 -> TextBuilder
decodeChar Word
0x62 Word8
first
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Word8 -> Bool
isAsciiByte ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ \Word8
second -> Word8 -> StateT (Confidence, ()) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
second StateT (Confidence, ()) (Parser ByteString) ()
-> TextBuilder -> TextBuilder
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first
    , (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> TextBuilder
forall state. Word8 -> Word8 -> StateTextBuilder state
decoderFailure2 Word8
first
    ]

-- | __Encoding:__
--      @[Big5 decoder]
--      (https://encoding.spec.whatwg.org/#big5-decoder)@
--      step 3, substeps 1-5
-- 
-- Given the two bytes of a multi-byte encoded character, try to retrieve their
-- associated textual representation.
decodeChar :: Word -> W.Word8 -> W.Word8 -> TextBuilder
decodeChar :: Word -> Word8 -> Word8 -> TextBuilder
decodeChar Word
offset Word8
first Word8
second = case Word
pointer of
    Word
1133 -> [Word8] -> String -> TextBuilder
forall state. [Word8] -> String -> StateTextBuilder state
emit' [Word8]
bs String
"\xCA\x0304"
    Word
1135 -> [Word8] -> String -> TextBuilder
forall state. [Word8] -> String -> StateTextBuilder state
emit' [Word8]
bs String
"\xCA\x030C"
    Word
1164 -> [Word8] -> String -> TextBuilder
forall state. [Word8] -> String -> StateTextBuilder state
emit' [Word8]
bs String
"\xEA\x0304"
    Word
1166 -> [Word8] -> String -> TextBuilder
forall state. [Word8] -> String -> StateTextBuilder state
emit' [Word8]
bs String
"\xEA\x030C"
    Word
_ -> TextBuilder -> (Char -> TextBuilder) -> Maybe Char -> TextBuilder
forall b a. b -> (a -> b) -> Maybe a -> b
maybe TextBuilder
forall state.
StateT
  (Confidence, state) (Parser ByteString) (DecoderError String)
failure ([Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs) (Maybe Char -> TextBuilder) -> Maybe Char -> TextBuilder
forall a b. (a -> b) -> a -> b
$ Word -> Maybe Char
decodeIndex Word
pointer
  where pointer :: Word
pointer = (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
first Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x81) Word -> Word -> Word
forall a. Num a => a -> a -> a
* Word
157 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
second Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
offset
        failure :: StateT
  (Confidence, state) (Parser ByteString) (DecoderError String)
failure
            | Word8 -> Bool
isAsciiByte Word8
second = Word8 -> StateT (Confidence, state) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
second StateT (Confidence, state) (Parser ByteString) ()
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first
            | Bool
otherwise = [Word8]
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8]
bs
        bs :: [Word8]
bs = [Word8
first, Word8
second]


-- | __Encoding:__
--      @[Big5 encoder]
--      (https://encoding.spec.whatwg.org/#big5-encoder)@
-- 
-- Encode the first 'Char' in a string according to the 'Big5' encoding scheme,
-- or return that same character if that scheme doesn't define a binary
-- representation for it.
encoder :: BinaryBuilder
encoder :: BinaryBuilder
encoder = StateT () (Parser Text) Char
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT () (Parser Text) Char
-> (Char -> BinaryBuilder) -> BinaryBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Char (StateT () (Parser Text)) (EncoderError ShortByteString)]
-> Char -> BinaryBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Char -> Bool
C.isAscii Char -> BinaryBuilder
forall state. Char -> StateBinaryBuilder state
fromAscii
    , (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else ((Char -> BinaryBuilder)
 -> SwitchCase
      Char (StateT () (Parser Text)) (EncoderError ShortByteString))
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall a b. (a -> b) -> a -> b
$ \Char
c -> BinaryBuilder
-> (Word -> BinaryBuilder) -> Maybe Word -> BinaryBuilder
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Char -> BinaryBuilder
forall state out. Char -> StateEncoder state (EncoderError out)
encoderFailure Char
c) Word -> BinaryBuilder
encoder' (Maybe Word -> BinaryBuilder) -> Maybe Word -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ Char -> Maybe Word
encodeIndex Char
c
    ]

-- | __Encoding:__
--      @[Big5 decoder]
--      (https://encoding.spec.whatwg.org/#big5-decoder)@
--      steps 5-8
-- 
-- Given an index pointer, calculate and return its binary representation.
-- Always returns a @'Right'@ value.
encoder' :: Word -> BinaryBuilder
encoder' :: Word -> BinaryBuilder
encoder' Word
index = EncoderError ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (EncoderError ShortByteString -> BinaryBuilder)
-> (ShortByteString -> EncoderError ShortByteString)
-> ShortByteString
-> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> EncoderError ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> BinaryBuilder)
-> ShortByteString -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ [Word8] -> ShortByteString
BS.SH.pack
    [Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
lead Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x81, Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
trail Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
offset]
  where (Word
lead, Word
trail) = Word -> Word -> (Word, Word)
forall a. Integral a => a -> a -> (a, a)
divMod Word
index Word
157
        offset :: Word8
offset = if Word
trail Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
< Word
0x3F then Word8
0x40 else Word8
0x62


-- | Look for a character in the 'Big5' encoding at the given index.
decodeIndex :: Word -> Maybe Char
decodeIndex :: Word -> Maybe Char
decodeIndex Word
index = IO (MemoizationTable Word (Maybe Char))
-> Maybe (IO (MemoizationTable Char (Maybe Word)))
-> Word
-> (Word -> Maybe Char)
-> Maybe Char
forall k v.
(Eq k, Hashable k, Eq v, Hashable v) =>
IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> k
-> (k -> Maybe v)
-> Maybe v
lookupMemoizedIndex IO (MemoizationTable Word (Maybe Char))
decodeIndexM (IO (MemoizationTable Char (Maybe Word))
-> Maybe (IO (MemoizationTable Char (Maybe Word)))
forall a. a -> Maybe a
Just IO (MemoizationTable Char (Maybe Word))
encodeIndexM) Word
index Word -> Maybe Char
readDecodeIndex

-- | Memoization table to save lookup time in the over-large 'Big5' index.
decodeIndexM :: DecoderMemoTable
decodeIndexM :: IO (MemoizationTable Word (Maybe Char))
decodeIndexM = IO (MemoizationTable Word (Maybe Char))
forall k v. IO (MemoizationTable k v)
newMemoizationTable
{-# NOINLINE decodeIndexM #-}

-- | Read the character at a given offset from the 'Big5' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex Word
index = Word -> [(Word, Char)] -> Maybe Char
forall k v. Ord k => k -> [(k, v)] -> Maybe v
search Word
index ([(Word, Char)] -> Maybe Char) -> [(Word, Char)] -> Maybe Char
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
"big5"


-- | Given a character, try to find the index value corresponding to it in the
-- 'Big5' encoding scheme.
encodeIndex :: Char -> Maybe Word
encodeIndex :: Char -> Maybe Word
encodeIndex Char
char = IO (MemoizationTable Char (Maybe Word))
-> Maybe (IO (MemoizationTable Word (Maybe Char)))
-> Char
-> (Char -> Maybe Word)
-> Maybe Word
forall k v.
(Eq k, Hashable k, Eq v, Hashable v) =>
IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> k
-> (k -> Maybe v)
-> Maybe v
lookupMemoizedIndex IO (MemoizationTable Char (Maybe Word))
encodeIndexM (IO (MemoizationTable Word (Maybe Char))
-> Maybe (IO (MemoizationTable Word (Maybe Char)))
forall a. a -> Maybe a
Just IO (MemoizationTable Word (Maybe Char))
decodeIndexM) Char
char Char -> Maybe Word
readEncodeIndex

-- | Memoization table to save lookup time in the over-large 'Big5' index.
encodeIndexM :: EncoderMemoTable
encodeIndexM :: IO (MemoizationTable Char (Maybe Word))
encodeIndexM = IO (MemoizationTable Char (Maybe Word))
forall k v. IO (MemoizationTable k v)
newMemoizationTable
{-# NOINLINE encodeIndexM #-}

-- | __Encoding:__
--      @[index Big5 pointer]
--      (https://encoding.spec.whatwg.org/#index-big5-pointer)@
-- 
-- Find the offset of a given character in the 'Big5' index.  Note that this
-- is a heavy function, and should be cached whenever possible.
readEncodeIndex :: Char -> Maybe Word
-- Hard code the exceptions as the algorithm abstraction makes reversing the
-- search order just for these impractical.
readEncodeIndex :: Char -> Maybe Word
readEncodeIndex Char
'\x2550' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
18991
readEncodeIndex Char
'\x255E' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
18975
readEncodeIndex Char
'\x2561' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
18977
readEncodeIndex Char
'\x256A' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
18976
readEncodeIndex Char
'\x5341' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
5512
readEncodeIndex Char
'\x5345' = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
5599
readEncodeIndex Char
char = Char -> [(Char, Word)] -> Maybe Word
forall k v. Ord k => k -> [(k, v)] -> Maybe v
search Char
char ([(Char, Word)] -> Maybe Word)
-> ([(Word, Char)] -> [(Char, Word)])
-> [(Word, Char)]
-> Maybe Word
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Word, Char) -> (Char, Word)) -> [(Word, Char)] -> [(Char, Word)]
forall a b. (a -> b) -> [a] -> [b]
map (Word, Char) -> (Char, Word)
forall a b. (a, b) -> (b, a)
U.swap ([(Word, Char)] -> Maybe Word) -> [(Word, Char)] -> Maybe Word
forall a b. (a -> b) -> a -> b
$ ((Word, Char) -> Bool) -> String -> [(Word, Char)]
loadIndex' (Word, Char) -> Bool
forall a b. (Ord a, Num a) => (a, b) -> Bool
filterIndex String
"big5"
  where filterIndex :: (a, b) -> Bool
filterIndex (a
i, b
_) = a
i a -> a -> Bool
forall a. Ord a => a -> a -> Bool
>= a
0x20 a -> a -> a
forall a. Num a => a -> a -> a
* a
157