-- Internal module to support UTF8
module UTF8 (toUTF8BS) where

import Data.Char (ord)
import Data.Bits
import Data.Word (Word8)
import qualified Data.ByteString as BS

toUTF8BS :: String -> BS.ByteString
toUTF8BS :: String -> ByteString
toUTF8BS = [Word8] -> ByteString
BS.pack ([Word8] -> ByteString)
-> (String -> [Word8]) -> String -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [Word8]
encodeStringUtf8

-- | Encode 'String' to a list of UTF8-encoded octets
--
-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
-- as the replacement character (i.e. @U+FFFD@).
--
-- The code is extracted from Cabal library, written originally HVR
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 :: String -> [Word8]
encodeStringUtf8 []        = []
encodeStringUtf8 (Char
c:String
cs)
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x07F' = Word8
w8
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x7FF' = (Word8
0xC0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR  Int
6          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xD7FF'= (Word8
0xE0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR Int
12          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  Int
6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF'= Word8
0xEF Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: Word8
0xBF Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: Word8
0xBD -- U+FFFD
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFFFF'= (Word8
0xE0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR Int
12          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  Int
6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  | Bool
otherwise    = (Word8
0xf0 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|.  Int -> Word8
w8ShiftR Int
18          )
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR Int
12 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Int -> Word8
w8ShiftR  Int
6 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: (Word8
0x80 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. (Word8
w8          Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F))
                 Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: String -> [Word8]
encodeStringUtf8 String
cs
  where
    w8 :: Word8
w8 = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) :: Word8
    w8ShiftR :: Int -> Word8
    w8ShiftR :: Int -> Word8
w8ShiftR = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Int -> Int) -> Int -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int -> Int
forall a. Bits a => a -> Int -> a
shiftR (Char -> Int
ord Char
c)