{-| Description: Character translation functions to and from the Big5 encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.Big5 ( decoder , encoder ) where import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Tuple as U import qualified Data.Word as W import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch -- | __Encoding:__ -- @[Big5 decoder] -- (https://encoding.spec.whatwg.org/#big5-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'Big5' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: TextBuilder decoder = next >>= switch [ If isAsciiByte toUnicode1 , If (range 0x81 0xFE) decoder' , Else decoderFailure1 ] -- | __Encoding:__ -- @[Big5 decoder] -- (https://encoding.spec.whatwg.org/#big5-decoder)@ -- step 3 -- -- Process a double-byte sequence according to the 'Big5' encoding scheme. decoder' :: W.Word8 -> TextBuilder decoder' first = next >>= switch [ If (range 0x40 0x7E) $ decodeChar 0x40 first , If (range 0xA1 0xFE) $ decodeChar 0x62 first , If isAsciiByte $ \second -> push second *> decoderFailure1 first , Else $ decoderFailure2 first ] -- | __Encoding:__ -- @[Big5 decoder] -- (https://encoding.spec.whatwg.org/#big5-decoder)@ -- step 3, substeps 1-5 -- -- Given the two bytes of a multi-byte encoded character, try to retrieve their -- associated textual representation. decodeChar :: Word -> W.Word8 -> W.Word8 -> TextBuilder decodeChar offset first second = case pointer of 1133 -> emit' bs "\xCA\x0304" 1135 -> emit' bs "\xCA\x030C" 1164 -> emit' bs "\xEA\x0304" 1166 -> emit' bs "\xEA\x030C" _ -> maybe failure (emit bs) $ decodeIndex pointer where pointer = (fromIntegral first - 0x81) * 157 + fromIntegral second - offset failure | isAsciiByte second = push second *> decoderFailure1 first | otherwise = decoderFailure bs bs = [first, second] -- | __Encoding:__ -- @[Big5 encoder] -- (https://encoding.spec.whatwg.org/#big5-encoder)@ -- -- Encode the first 'Char' in a string according to the 'Big5' encoding scheme, -- or return that same character if that scheme doesn't define a binary -- representation for it. encoder :: BinaryBuilder encoder = next >>= switch [ If C.isAscii fromAscii , Else $ \c -> maybe (encoderFailure c) encoder' $ encodeIndex c ] -- | __Encoding:__ -- @[Big5 decoder] -- (https://encoding.spec.whatwg.org/#big5-decoder)@ -- steps 5-8 -- -- Given an index pointer, calculate and return its binary representation. -- Always returns a @'Right'@ value. encoder' :: Word -> BinaryBuilder encoder' index = return . pure $ BS.SH.pack [fromIntegral lead + 0x81, fromIntegral trail + offset] where (lead, trail) = divMod index 157 offset = if trail < 0x3F then 0x40 else 0x62 -- | Look for a character in the 'Big5' encoding at the given index. decodeIndex :: Word -> Maybe Char decodeIndex index = lookupMemoizedIndex decodeIndexM (Just encodeIndexM) index readDecodeIndex -- | Memoization table to save lookup time in the over-large 'Big5' index. decodeIndexM :: DecoderMemoTable decodeIndexM = newMemoizationTable {-# NOINLINE decodeIndexM #-} -- | Read the character at a given offset from the 'Big5' index. Note that -- this is a heavy function, and should be cached whenever possible. readDecodeIndex :: Word -> Maybe Char readDecodeIndex index = search index $ loadIndex "big5" -- | Given a character, try to find the index value corresponding to it in the -- 'Big5' encoding scheme. encodeIndex :: Char -> Maybe Word encodeIndex char = lookupMemoizedIndex encodeIndexM (Just decodeIndexM) char readEncodeIndex -- | Memoization table to save lookup time in the over-large 'Big5' index. encodeIndexM :: EncoderMemoTable encodeIndexM = newMemoizationTable {-# NOINLINE encodeIndexM #-} -- | __Encoding:__ -- @[index Big5 pointer] -- (https://encoding.spec.whatwg.org/#index-big5-pointer)@ -- -- Find the offset of a given character in the 'Big5' index. Note that this -- is a heavy function, and should be cached whenever possible. readEncodeIndex :: Char -> Maybe Word -- Hard code the exceptions as the algorithm abstraction makes reversing the -- search order just for these impractical. readEncodeIndex '\x2550' = Just 18991 readEncodeIndex '\x255E' = Just 18975 readEncodeIndex '\x2561' = Just 18977 readEncodeIndex '\x256A' = Just 18976 readEncodeIndex '\x5341' = Just 5512 readEncodeIndex '\x5345' = Just 5599 readEncodeIndex char = search char . map U.swap $ loadIndex' filterIndex "big5" where filterIndex (i, _) = i >= 0x20 * 157