{-# LANGUAGE CPP,DeriveDataTypeable #-} {- | GB18030 is a chinese character encoding that is mandatory in china (if you - don\'t implement it, you\'re not allowed to sell your software there). -} module Data.Encoding.GB18030 (GB18030(..)) where import Control.Throws import Data.Char (chr,ord) import Data.Word import Data.Bits import Data.Encoding.Base import Data.Encoding.ByteSource import Data.Encoding.ByteSink import Data.Encoding.Exception import Data.ByteString (ByteString) import qualified Data.ByteString as BS import Data.Typeable #if __GLASGOW_HASKELL__>=608 import Data.ByteString.Unsafe (unsafeIndex) #else import Data.ByteString.Base (unsafeIndex) #endif import Data.Encoding.GB18030Data data GB18030 = GB18030 deriving (Eq,Show,Typeable) instance Encoding GB18030 where decodeChar _ = do w1 <- fetchWord8 case () of _ | w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii | w1 <= 0xFE -> do w2 <- fetchWord8 case () of _ | w2 < 0x30 -> throwException (IllegalCharacter w2) | w2 <= 0x39 -> do w3 <- fetchWord8 case () of _ | w3 < 0x81 -> throwException (IllegalCharacter w3) | w3 <= 0xFE -> do w4 <- fetchWord8 case () of _ | w4 < 0x30 -> throwException (IllegalCharacter w4) | w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4 | otherwise -> throwException (IllegalCharacter w4) | otherwise -> throwException (IllegalCharacter w3) | w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2 | w2 == 0x7F -> throwException (IllegalCharacter w2) | w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2 | otherwise -> throwException (IllegalCharacter w2) | otherwise -> throwException (IllegalCharacter w1) {- How this works: The nested if-structures form an binary tree over the - encoding range. -} encodeChar _ ch = if ch<='\x4946' -- 1 then (if ch<='\x4055' -- 2 then (if ch<='\x2E80' -- 3 then (if ch<='\x200F' -- 4 then (if ch<'\x0452' then arr 0x0000 arr1 else range range1) else (if ch<'\x2643' then arr 0x2010 arr2 else range range2)) else (if ch<='\x3917' -- 4 then (if ch<'\x361B' then arr 0x2E81 arr3 else range range3) else (if ch<'\x3CE1' then arr 0x3918 arr4 else range range4))) else (if ch<='\x464B' -- 3 then (if ch<='\x4336' -- 4 then (if ch<'\x4160' then arr 0x4056 arr5 else range range5) else (if ch<'\x44D7' then arr 0x4337 arr6 else range range6)) else (if ch<'\x478E' then arr 0x464C arr7 else range range7))) else (if ch<='\xF92B' -- 2 then (if ch<='\xD7FF' -- 3 then (if ch<='\x4C76' -- 4 then (if ch<'\x49B8' then arr 0x4947 arr8 else range range8) else (if ch<'\x9FA6' then arr 0x4C77 arr9 else range range9)) else (if ch<'\xE865' then arr 0xD800 arr10 else range range10)) else (if ch<='\xFFFF' -- 3 then (if ch<='\xFE2F' -- 4 then (if ch<'\xFA2A' then arr 0xF92C arr11 else range range11) else (if ch<'\xFFE6' then arr 0xFE30 arr12 else range range12)) else (if ch<='\x10FFFF' -- 4 then range range13 else throwException (HasNoRepresentation ch)))) where range r = let (w1,w2,w3,w4) = delinear (ord ch + r) in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4 arr off a = let ind = (ord ch - off)*5 w1 = unsafeIndex a (ind+1) w2 = unsafeIndex a (ind+2) w3 = unsafeIndex a (ind+3) w4 = unsafeIndex a (ind+4) in do pushWord8 w1 case unsafeIndex a ind of 1 -> return () 2 -> pushWord8 w2 3 -> pushWord8 w2 >> pushWord8 w3 4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4 encodeable _ c = c <= '\x10FFFF' linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int linear w1 w2 w3 w4 = (fromIntegral (w4-0x30)) + (fromIntegral (w3-0x81))*10 + (fromIntegral (w2-0x30))*1260 + (fromIntegral (w1-0x81))*12600 linear2 :: Word8 -> Word8 -> Int linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E then 0x40 else 0x41))) + (fromIntegral (w1-0x81))*190 delinear :: Int -> (Word8,Word8,Word8,Word8) delinear n = let (w1,n1) = n `divMod` 12600 (w2,n2) = n1 `divMod` 1260 (w3,n3) = n2 `divMod` 10 w4 = n3 in (fromIntegral w1+0x81 ,fromIntegral w2+0x30 ,fromIntegral w3+0x81 ,fromIntegral w4+0x30) decodeGBTwo :: Int -> Char decodeGBTwo n = let rn = n*2 w1 = unsafeIndex rrarr rn w2 = unsafeIndex rrarr (rn+1) in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2) decodeGBFour :: ByteSource m => Int -> m Char decodeGBFour v = if v<=17858 -- 1 then (if v<=15582 -- 2 then (if v<=11328 -- 3 then (if v<=7921 -- 4 then (if v<820 then arr 0 rarr1 else range range1) else (if v<9219 then arr 7922 rarr2 else range range2)) else (if v<=13737 -- 4 then (if v<12973 then arr 11329 rarr3 else range range3) else (if v<14698 then arr 13738 rarr4 else range range4))) else (if v<=17101 -- 3 then (if v<=16317 -- 4 then (if v<15847 then arr 15583 rarr5 else range range5) else (if v<16729 then arr 16318 rarr6 else range range6)) else (if v<17418 then arr 17102 rarr7 else range range7))) else (if v<=37844 -- 2 then (if v<=33468 -- 3 then (if v<=18663 -- 4 then (if v<17961 then arr 17859 rarr8 else range range8) else (if v<19043 then arr 18664 rarr9 else range range9)) else (if v<33550 then arr 33469 rarr10 else range range10)) else (if v<=39419 -- 3 then (if v<=39107 -- 4 then (if v<38078 then arr 37845 rarr11 else range range11) else (if v<39394 then arr 39108 rarr12 else range range12)) else (if v<=1237575 && v>=189000 then range range13 else throwException OutOfRange))) where arr off a = let v' = (v-off)*2 w1 = unsafeIndex a v' w2 = unsafeIndex a (v'+1) in return $ chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2) range r = return $ chr (v-r) range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int range1 = -286 range2 = -576 range3 = -878 range4 = -887 range5 = -889 range6 = -894 range7 = -900 range8 = -911 range9 = -21827 range10 = -25943 range11 = -25964 range12 = -26116 range13 = 123464