{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
{-# OPTIONS_GHC -O #-}
module Encoding (
utf8DecodeChar#,
utf8PrevChar,
utf8CharStart,
utf8DecodeChar,
utf8DecodeByteString,
utf8DecodeStringLazy,
utf8EncodeChar,
utf8EncodeString,
utf8EncodedLength,
countUTF8Chars,
zEncodeString,
zDecodeString,
toBase62,
toBase62Padded
) where
import Foreign
import Foreign.ForeignPtr.Unsafe
import Data.Char
import qualified Data.Char as Char
import Numeric
import GHC.IO
import Data.ByteString (ByteString)
import qualified Data.ByteString.Internal as BS
import GHC.Exts
{-# INLINE utf8DecodeChar# #-}
utf8DecodeChar# :: Addr# -> (# Char#, Int# #)
utf8DecodeChar# a# =
let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in
case () of
_ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
| isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
2# #)
| isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
3# #)
| isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
let !ch3 = word2Int# (indexWord8OffAddr# a# 3#) in
if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
(# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
4# #)
| otherwise -> fail 1#
where
fail :: Int# -> (# Char#, Int# #)
fail nBytes# = (# '\0'#, nBytes# #)
utf8DecodeChar :: Ptr Word8 -> (Char, Int)
utf8DecodeChar (Ptr a#) =
case utf8DecodeChar# a# of (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8)
utf8PrevChar p = utf8CharStart (p `plusPtr` (-1))
utf8CharStart :: Ptr Word8 -> IO (Ptr Word8)
utf8CharStart p = go p
where go p = do w <- peek p
if w >= 0x80 && w < 0xC0
then go (p `plusPtr` (-1))
else return p
utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS ptr offset len)
= utf8DecodeStringLazy ptr offset len
utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeStringLazy fptr offset len
= unsafeDupablePerformIO $ unpack start
where
!start = unsafeForeignPtrToPtr fptr `plusPtr` offset
!end = start `plusPtr` len
unpack p
| p >= end = touchForeignPtr fptr >> return []
| otherwise =
case utf8DecodeChar# (unPtr p) of
(# c#, nBytes# #) -> do
rest <- unsafeDupableInterleaveIO $ unpack (p `plusPtr#` nBytes#)
return (C# c# : rest)
countUTF8Chars :: Ptr Word8 -> Int -> IO Int
countUTF8Chars ptr len = go ptr 0
where
!end = ptr `plusPtr` len
go p !n
| p >= end = return n
| otherwise = do
case utf8DecodeChar# (unPtr p) of
(# _, nBytes# #) -> go (p `plusPtr#` nBytes#) (n+1)
unPtr :: Ptr a -> Addr#
unPtr (Ptr a) = a
plusPtr# :: Ptr a -> Int# -> Ptr a
plusPtr# ptr nBytes# = ptr `plusPtr` (I# nBytes#)
utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8)
utf8EncodeChar c ptr =
let x = ord c in
case () of
_ | x > 0 && x <= 0x007f -> do
poke ptr (fromIntegral x)
return (ptr `plusPtr` 1)
| x <= 0x07ff -> do
poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 2)
| x <= 0xffff -> do
poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F))
pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 3)
| otherwise -> do
poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18)))
pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F)))
pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F)))
pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F)))
return (ptr `plusPtr` 4)
utf8EncodeString :: Ptr Word8 -> String -> IO ()
utf8EncodeString ptr str = go ptr str
where go !_ [] = return ()
go ptr (c:cs) = do
ptr' <- utf8EncodeChar c ptr
go ptr' cs
utf8EncodedLength :: String -> Int
utf8EncodedLength str = go 0 str
where go !n [] = n
go n (c:cs)
| ord c > 0 && ord c <= 0x007f = go (n+1) cs
| ord c <= 0x07ff = go (n+2) cs
| ord c <= 0xffff = go (n+3) cs
| otherwise = go (n+4) cs
type UserString = String
type EncodedString = String
zEncodeString :: UserString -> EncodedString
zEncodeString cs = case maybe_tuple cs of
Just n -> n
Nothing -> go cs
where
go [] = []
go (c:cs) = encode_digit_ch c ++ go' cs
go' [] = []
go' (c:cs) = encode_ch c ++ go' cs
unencodedChar :: Char -> Bool
unencodedChar 'Z' = False
unencodedChar 'z' = False
unencodedChar c = c >= 'a' && c <= 'z'
|| c >= 'A' && c <= 'Z'
|| c >= '0' && c <= '9'
encode_digit_ch :: Char -> EncodedString
encode_digit_ch c | c >= '0' && c <= '9' = encode_as_unicode_char c
encode_digit_ch c | otherwise = encode_ch c
encode_ch :: Char -> EncodedString
encode_ch c | unencodedChar c = [c]
encode_ch '(' = "ZL"
encode_ch ')' = "ZR"
encode_ch '[' = "ZM"
encode_ch ']' = "ZN"
encode_ch ':' = "ZC"
encode_ch 'Z' = "ZZ"
encode_ch 'z' = "zz"
encode_ch '&' = "za"
encode_ch '|' = "zb"
encode_ch '^' = "zc"
encode_ch '$' = "zd"
encode_ch '=' = "ze"
encode_ch '>' = "zg"
encode_ch '#' = "zh"
encode_ch '.' = "zi"
encode_ch '<' = "zl"
encode_ch '-' = "zm"
encode_ch '!' = "zn"
encode_ch '+' = "zp"
encode_ch '\'' = "zq"
encode_ch '\\' = "zr"
encode_ch '/' = "zs"
encode_ch '*' = "zt"
encode_ch '_' = "zu"
encode_ch '%' = "zv"
encode_ch c = encode_as_unicode_char c
encode_as_unicode_char :: Char -> EncodedString
encode_as_unicode_char c = 'z' : if isDigit (head hex_str) then hex_str
else '0':hex_str
where hex_str = showHex (ord c) "U"
zDecodeString :: EncodedString -> UserString
zDecodeString [] = []
zDecodeString ('Z' : d : rest)
| isDigit d = decode_tuple d rest
| otherwise = decode_upper d : zDecodeString rest
zDecodeString ('z' : d : rest)
| isDigit d = decode_num_esc d rest
| otherwise = decode_lower d : zDecodeString rest
zDecodeString (c : rest) = c : zDecodeString rest
decode_upper, decode_lower :: Char -> Char
decode_upper 'L' = '('
decode_upper 'R' = ')'
decode_upper 'M' = '['
decode_upper 'N' = ']'
decode_upper 'C' = ':'
decode_upper 'Z' = 'Z'
decode_upper ch = ch
decode_lower 'z' = 'z'
decode_lower 'a' = '&'
decode_lower 'b' = '|'
decode_lower 'c' = '^'
decode_lower 'd' = '$'
decode_lower 'e' = '='
decode_lower 'g' = '>'
decode_lower 'h' = '#'
decode_lower 'i' = '.'
decode_lower 'l' = '<'
decode_lower 'm' = '-'
decode_lower 'n' = '!'
decode_lower 'p' = '+'
decode_lower 'q' = '\''
decode_lower 'r' = '\\'
decode_lower 's' = '/'
decode_lower 't' = '*'
decode_lower 'u' = '_'
decode_lower 'v' = '%'
decode_lower ch = ch
decode_num_esc :: Char -> EncodedString -> UserString
decode_num_esc d rest
= go (digitToInt d) rest
where
go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest
go n ('U' : rest) = chr n : zDecodeString rest
go n other = error ("decode_num_esc: " ++ show n ++ ' ':other)
decode_tuple :: Char -> EncodedString -> UserString
decode_tuple d rest
= go (digitToInt d) rest
where
go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest
go 0 ('T':rest) = "()" ++ zDecodeString rest
go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest
go 1 ('H':rest) = "(# #)" ++ zDecodeString rest
go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest
go n other = error ("decode_tuple: " ++ show n ++ ' ':other)
maybe_tuple :: UserString -> Maybe EncodedString
maybe_tuple "(# #)" = Just("Z1H")
maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of
(n, '#' : ')' : _) -> Just ('Z' : shows (n+1) "H")
_ -> Nothing
maybe_tuple "()" = Just("Z0T")
maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of
(n, ')' : _) -> Just ('Z' : shows (n+1) "T")
_ -> Nothing
maybe_tuple _ = Nothing
count_commas :: Int -> String -> (Int, String)
count_commas n (',' : cs) = count_commas (n+1) cs
count_commas n cs = (n,cs)
word64Base62Len :: Int
word64Base62Len = 11
toBase62Padded :: Word64 -> String
toBase62Padded w = pad ++ str
where
pad = replicate len '0'
len = word64Base62Len - length str
str = toBase62 w
toBase62 :: Word64 -> String
toBase62 w = showIntAtBase 62 represent w ""
where
represent :: Int -> Char
represent x
| x < 10 = Char.chr (48 + x)
| x < 36 = Char.chr (65 + x - 10)
| x < 62 = Char.chr (97 + x - 36)
| otherwise = error "represent (base 62): impossible!"