{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
--
-- |
-- Module      :  Codec.Binary.UTF8.String
-- Copyright   :  (c) Eric Mertens 2007
-- License     :  BSD3-style (see LICENSE)
--
-- Maintainer:    emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Support for encoding UTF8 Strings to and from @['Word8']@
--

module Codec.Binary.UTF8.String (
      encode
    , decode
    , encodeString
    , decodeString
    , encodeChar

    , isUTF8Encoded
    , utf8Encode
  ) where

import Data.Word        (Word8,Word32)
import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
import Data.Char        (chr,ord)

default(Int)

-- | Encode a string using 'encode' and store the result in a 'String'.
encodeString :: String -> String
encodeString :: String -> String
encodeString String
xs = (Word8 -> Char) -> [Word8] -> String
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a. Enum a => a -> Int
fromEnum) (String -> [Word8]
encode String
xs)

-- | Decode a string using 'decode' using a 'String' as input.
-- This is not safe but it is necessary if UTF-8 encoded text
-- has been loaded into a 'String' prior to being decoded.
decodeString :: String -> String
decodeString :: String -> String
decodeString String
xs = [Word8] -> String
decode ((Char -> Word8) -> String -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Word8
forall a. Enum a => Int -> a
toEnum (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum) String
xs)

replacement_character :: Char
replacement_character :: Char
replacement_character = Char
'\xfffd'

-- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format.
encodeChar :: Char -> [Word8]
encodeChar :: Char -> [Word8]
encodeChar = (Int -> Word8) -> [Int] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Int] -> [Word8]) -> (Char -> [Int]) -> Char -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Int]
forall a. (Ord a, Num a, Bits a) => a -> [a]
go (Int -> [Int]) -> (Char -> Int) -> Char -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord
 where
  go :: a -> [a]
go a
oc
   | a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0x7f       = [a
oc]

   | a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0x7ff      = [ a
0xc0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
                        ]

   | a
oc a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
0xffff     = [ a
0xe0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
                        ]
   | Bool
otherwise        = [ a
0xf0 a -> a -> a
forall a. Num a => a -> a -> a
+ (a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
18)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ ((a
oc a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f)
                        , a
0x80 a -> a -> a
forall a. Num a => a -> a -> a
+ a
oc a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0x3f
                        ]


-- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format.
encode :: String -> [Word8]
encode :: String -> [Word8]
encode = (Char -> [Word8]) -> String -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> [Word8]
encodeChar

--
-- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String'
--
decode :: [Word8] -> String
decode :: [Word8] -> String
decode [    ] = String
""
decode (Word8
c:[Word8]
cs)
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80  = Int -> Char
chr (Word8 -> Int
forall a. Enum a => a -> Int
fromEnum Word8
c) Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xc0  = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xe0  = String
multi1
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xf0  = Int -> Word8 -> Int -> String
multi_byte Int
2 Word8
0xf  Int
0x800
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xf8  = Int -> Word8 -> Int -> String
multi_byte Int
3 Word8
0x7  Int
0x10000
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xfc  = Int -> Word8 -> Int -> String
multi_byte Int
4 Word8
0x3  Int
0x200000
  | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xfe  = Int -> Word8 -> Int -> String
multi_byte Int
5 Word8
0x1  Int
0x4000000
  | Bool
otherwise = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs
  where
    multi1 :: String
multi1 = case [Word8]
cs of
      Word8
c1 : [Word8]
ds | Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0xc0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x80 ->
        let d :: Int
d = ((Word8 -> Int
forall a. Enum a => a -> Int
fromEnum Word8
c Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x1f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|.  Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
c1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3f)
        in if Int
d Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x000080 then Int -> Char
forall a. Enum a => Int -> a
toEnum Int
d Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
ds
                            else Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
ds
      [Word8]
_ -> Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
cs

    multi_byte :: Int -> Word8 -> Int -> [Char]
    multi_byte :: Int -> Word8 -> Int -> String
multi_byte Int
i Word8
mask Int
overlong = Int -> [Word8] -> Int -> String
forall t. (Eq t, Num t) => t -> [Word8] -> Int -> String
aux Int
i [Word8]
cs (Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
c Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
mask))
      where
        aux :: t -> [Word8] -> Int -> String
aux t
0 [Word8]
rs Int
acc
          | Int
overlong Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
acc Bool -> Bool -> Bool
&& Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x10ffff Bool -> Bool -> Bool
&&
            (Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xd800 Bool -> Bool -> Bool
|| Int
0xdfff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
acc)     Bool -> Bool -> Bool
&&
            (Int
acc Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xfffe Bool -> Bool -> Bool
|| Int
0xffff Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
acc)      = Int -> Char
chr Int
acc Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs
          | Bool
otherwise = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs

        aux t
n (Word8
r:[Word8]
rs) Int
acc
          | Word8
r Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0xc0 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x80 = t -> [Word8] -> Int -> String
aux (t
nt -> t -> t
forall a. Num a => a -> a -> a
-t
1) [Word8]
rs
                               (Int -> String) -> Int -> String
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftL Int
acc Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Word8 -> Int
forall a. Enum a => a -> Int
fromEnum (Word8
r Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3f)

        aux t
_ [Word8]
rs     Int
_ = Char
replacement_character Char -> String -> String
forall a. a -> [a] -> [a]
: [Word8] -> String
decode [Word8]
rs


-- | @utf8Encode str@ is a convenience function; checks to see if
-- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but
-- you are better off keeping track of the encoding so as to avoid
-- the cost of checking.
utf8Encode :: String -> String
utf8Encode :: String -> String
utf8Encode String
str
 | String -> Bool
isUTF8Encoded String
str = String
str
 | Bool
otherwise         = String -> String
encodeString String
str


-- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form.
isUTF8Encoded :: String -> Bool
isUTF8Encoded :: String -> Bool
isUTF8Encoded [] = Bool
True
isUTF8Encoded (Char
x:String
xs) =
  case Word32
ox of
    Word32
_ | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x80  -> String -> Bool
isUTF8Encoded String
xs
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
> Word32
0xff  -> Bool
False
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xc0  -> Bool
False
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xe0  -> Bool
check1
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xf0  -> Int -> Word32 -> Word32 -> Bool
check_byte Int
2 Word32
0xf Word32
0
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xf8  -> Int -> Word32 -> Word32 -> Bool
check_byte Int
3 Word32
0x7  Word32
0x10000
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfc  -> Int -> Word32 -> Word32 -> Bool
check_byte Int
4 Word32
0x3  Word32
0x200000
      | Word32
ox Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfe  -> Int -> Word32 -> Word32 -> Bool
check_byte Int
5 Word32
0x1  Word32
0x4000000
      | Bool
otherwise  -> Bool
False
 where
   ox :: Word32
ox = Char -> Word32
toW32 Char
x

   toW32 :: Char -> Word32
   toW32 :: Char -> Word32
toW32 Char
ch = Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
ch)

   check1 :: Bool
check1 =
    case String
xs of
     [] -> Bool
False
     Char
c1 : String
ds
      | Word32
oc Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0xc0 Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word32
0x80 Bool -> Bool -> Bool
|| Word32
d Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0x000080 -> Bool
False
      | Bool
otherwise -> String -> Bool
isUTF8Encoded String
ds
      where
       oc :: Word32
oc = Char -> Word32
toW32 Char
c1
       d :: Word32
d = ((Word32
ox Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x1f) Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
6) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|.  (Word32
oc Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3f)

   check_byte :: Int -> Word32 -> Word32 -> Bool
   check_byte :: Int -> Word32 -> Word32 -> Bool
check_byte Int
i Word32
mask Word32
overlong = Int -> String -> Word32 -> Bool
forall t. (Eq t, Num t) => t -> String -> Word32 -> Bool
aux Int
i String
xs (Word32
ox Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
mask)
      where
        aux :: t -> String -> Word32 -> Bool
aux t
0 String
rs Word32
acc
         | Word32
overlong Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
acc Bool -> Bool -> Bool
&&
           Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
0x10ffff Bool -> Bool -> Bool
&&
           (Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xd800 Bool -> Bool -> Bool
|| Word32
0xdfff Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
acc) Bool -> Bool -> Bool
&&
           (Word32
acc Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
0xfffe Bool -> Bool -> Bool
|| Word32
0xffff Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
< Word32
acc) = String -> Bool
isUTF8Encoded String
rs
         | Bool
otherwise = Bool
False

        aux t
n (Char
r:String
rs) Word32
acc
         | Char -> Word32
toW32 Char
r Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0xc0 Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
0x80 =
            t -> String -> Word32 -> Bool
aux (t
nt -> t -> t
forall a. Num a => a -> a -> a
-t
1) String
rs  (Word32
acc Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
6 Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. (Char -> Word32
toW32 Char
r Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3f))

        aux t
_ String
_  Word32
_ = Bool
False