module Ideas.Text.UTF8
( encode, encodeM, decode, decodeM
, isUTF8, allBytes, propEncoding
) where
import Control.Monad
import Data.Char
import Data.Maybe
import Test.QuickCheck
encode :: String -> String
encode = either error id . encodeM
decode :: String -> String
decode = either error id . decodeM
encodeM :: Monad m => String -> m String
encodeM = fmap (map chr . concat) . mapM (toUTF8 . ord)
decodeM :: Monad m => String -> m String
decodeM = fmap (map chr) . fromUTF8 . map ord
isUTF8 :: String -> Bool
isUTF8 = isJust . decodeM
allBytes :: String -> Bool
allBytes = all ((`between` (0, 255)) . ord)
toUTF8 :: Monad m => Int -> m [Int]
toUTF8 n
| n < 128 =
return [n]
| n < 2048 =
let (a, d) = n `divMod` 64
in return [a+192, d+128]
| n < 65536 =
let (a, d1) = n `divMod` 4096
(b, d2) = d1 `divMod` 64
in return [a+224, b+128, d2+128]
| n < 1114112 =
let (a, d1) = n `divMod` 262144
(b, d2) = d1 `divMod` 4096
(c, d3) = d2 `divMod` 64
in return [a+240, b+128, c+128, d3+128]
| otherwise =
fail "invalid character in UTF8"
fromUTF8 :: Monad m => [Int] -> m [Int]
fromUTF8 xs
| null xs = return []
| otherwise = do
(i, rest) <- f xs
is <- fromUTF8 rest
return (i:is)
where
f (a:rest) | a < 128 =
return (a, rest)
f (a:b:rest) | a `between` (192, 223) = do
unless (isHigh b) $
fail "invalid UTF8 character (two bytes)"
return ((a-192)*64 + b-128, rest)
f (a:b:c:rest) | a `between` (224, 239) = do
unless (isHigh b && isHigh c) $
fail "invalid UTF8 character (three bytes)"
return ((a-224)*4096 + (b-128)*64 + c-128, rest)
f (a:b:c:d:rest) | a >= 240 && a < 248 = do
let value = (a-240)*262144 + (b-128)*4096 + (c-128)*64 + d-128
unless (isHigh b && isHigh c && isHigh d && value <= 1114111) $
fail "invalid UTF8 character (four bytes)"
return (value, rest)
f _ = fail "invalid character in UTF8"
isHigh :: Int -> Bool
isHigh i = i `between` (128, 191)
between :: Ord a => a -> (a, a) -> Bool
between a (low, high) = low <= a && a <= high
propEncoding :: Property
propEncoding = forAll (sized gen) valid
where
gen n = replicateM n someChar
someChar = chr <$> oneof
[ choose (0, 127), choose (128, 2047)
, choose (2048, 65535), choose (65536, 1114111)
]
valid :: String -> Bool
valid xs = fromMaybe False $
do us <- encodeM xs
bs <- decodeM us
return (xs == bs)