{-# LANGUAGE BangPatterns  #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE Safe          #-}

-- |
-- Copyright: © Oren Ben-Kiki 2007,
--            © Herbert Valerio Riedel 2015-2018
-- SPDX-License-Identifier: GPL-2.0-or-later
--
-- UTF decoding
--
-- This really should be factored out to the standard libraries. Since it isn't
-- there, we get to tailor it exactly to our needs. We use lazy byte strings as
-- input, which should give reasonable I\/O performance when reading large
-- files. The output is a normal 'Char' list which is easy to work with and
-- should be efficient enough as long as the 'Data.YAML.Token.Parser' does its job right.
--
module Data.YAML.Token.Encoding
  ( decode
  , Encoding(..)
  ) where

import qualified Data.ByteString.Lazy       as BL
import qualified Data.ByteString.Lazy.Char8 as BLC

import           Util

-- | Recognized Unicode encodings. As of YAML 1.2 UTF-32 is also required.
data Encoding = UTF8    -- ^ UTF-8 encoding (or ASCII)
              | UTF16LE -- ^ UTF-16 little endian
              | UTF16BE -- ^ UTF-16 big endian
              | UTF32LE -- ^ UTF-32 little endian
              | UTF32BE -- ^ UTF-32 big endian
              deriving (Encoding -> Encoding -> Bool
(Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool) -> Eq Encoding
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Encoding -> Encoding -> Bool
$c/= :: Encoding -> Encoding -> Bool
== :: Encoding -> Encoding -> Bool
$c== :: Encoding -> Encoding -> Bool
Eq,(forall x. Encoding -> Rep Encoding x)
-> (forall x. Rep Encoding x -> Encoding) -> Generic Encoding
forall x. Rep Encoding x -> Encoding
forall x. Encoding -> Rep Encoding x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Encoding x -> Encoding
$cfrom :: forall x. Encoding -> Rep Encoding x
Generic)

-- | @show encoding@ converts an 'Encoding' to the encoding name (with a "-")
-- as used by most programs.
instance Show Encoding where
    show :: Encoding -> String
show UTF8    = "UTF-8"
    show UTF16LE = "UTF-16LE"
    show UTF16BE = "UTF-16BE"
    show UTF32LE = "UTF-32LE"
    show UTF32BE = "UTF-32BE"


-- | @since 0.2.0
instance NFData Encoding where rnf :: Encoding -> ()
rnf !Encoding
_ = ()

-- | @decode bytes@ automatically detects the 'Encoding' used and converts the
-- /bytes/ to Unicode characters, with byte offsets. Note the offset is for
-- past end of the character, not its beginning.
decode :: BLC.ByteString -> (Encoding, [(Int, Char)])
decode :: ByteString -> (Encoding, [(Int, Char)])
decode text :: ByteString
text = (Encoding
encoding, Encoding -> ByteString -> [(Int, Char)]
undoEncoding Encoding
encoding ByteString
text)
  where encoding :: Encoding
encoding = String -> Encoding
detectEncoding (String -> Encoding) -> String -> Encoding
forall a b. (a -> b) -> a -> b
$ ByteString -> String
BLC.unpack (ByteString -> String) -> ByteString -> String
forall a b. (a -> b) -> a -> b
$ Int64 -> ByteString -> ByteString
BLC.take 4 ByteString
text

-- | @detectEncoding text@ examines the first few chars (bytes) of the /text/
-- to deduce the Unicode encoding used according to the YAML spec.
detectEncoding :: [Char] -> Encoding
detectEncoding :: String -> Encoding
detectEncoding text :: String
text =
  case String
text of
    '\x00' : '\x00' : '\xFE' : '\xFF' : _ -> Encoding
UTF32BE
    '\x00' : '\x00' : '\x00' : _      : _ -> Encoding
UTF32BE
    '\xFF' : '\xFE' : '\x00' : '\x00' : _ -> Encoding
UTF32LE
    _      : '\x00' : '\x00' : '\x00' : _ -> Encoding
UTF32LE
    '\xFE' : '\xFF' : _                   -> Encoding
UTF16BE
    '\x00' : _      : _                   -> Encoding
UTF16BE
    '\xFF' : '\xFE' : _                   -> Encoding
UTF16LE
    _      : '\x00' : _                   -> Encoding
UTF16LE
    '\xEF' : '\xBB' : '\xBF' : _          -> Encoding
UTF8
    _                                     -> Encoding
UTF8

-- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode
-- characters according to the /encoding/.
undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)]
undoEncoding :: Encoding -> ByteString -> [(Int, Char)]
undoEncoding encoding :: Encoding
encoding bytes :: ByteString
bytes =
  case Encoding
encoding of
    UTF8    -> ByteString -> Int -> [(Int, Char)]
undoUTF8 ByteString
bytes 0
    UTF16LE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
bytes 0
    UTF16BE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
bytes 0
    UTF32LE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
bytes 0
    UTF32BE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
bytes 0
  where
    validateScalars :: [(a, Char)] -> [(a, Char)]
validateScalars [] = []
    validateScalars (x :: (a, Char)
x@(_,c :: Char
c):rest :: [(a, Char)]
rest)
      | '\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c, Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xDFFF' = String -> [(a, Char)]
forall a. HasCallStack => String -> a
error "UTF-32 stream contains invalid surrogate code-point"
      | Bool
otherwise                    = (a, Char)
x (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
validateScalars [(a, Char)]
rest

-- ** UTF-32 decoding

-- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/
-- left to read.
hasFewerThan :: Int -> BLC.ByteString -> Bool
hasFewerThan :: Int -> ByteString -> Bool
hasFewerThan n :: Int
n bytes :: ByteString
bytes
  | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 1 = ByteString -> Bool
BLC.null ByteString
bytes
  | Int
n  Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 1 = ByteString -> Bool
BLC.null ByteString
bytes Bool -> Bool -> Bool
|| Int -> ByteString -> Bool
hasFewerThan (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- 1) (ByteString -> ByteString
BLC.tail ByteString
bytes)
  | Bool
otherwise = Bool
False

-- | @undoUTF32LE bytes offset@ decoded a UTF-32LE /bytes/ stream to Unicode
-- chars.
undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32LE :: ByteString -> Int -> [(Int, Char)]
undoUTF32LE bytes :: ByteString
bytes offset :: Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan 4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-32LE input contains invalid number of bytes"
  | Bool
otherwise = let first :: Char
first    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes'   = ByteString -> ByteString
BLC.tail ByteString
bytes
                    second :: Char
second   = ByteString -> Char
BLC.head ByteString
bytes'
                    bytes'' :: ByteString
bytes''  = ByteString -> ByteString
BLC.tail ByteString
bytes'
                    third :: Char
third    = ByteString -> Char
BLC.head ByteString
bytes''
                    bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
                    fourth :: Char
fourth   = ByteString -> Char
BLC.head ByteString
bytes'''
                    rest :: ByteString
rest     = ByteString -> ByteString
BLC.tail ByteString
bytes'''
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 4,
                    Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$        Char -> Int
ord Char
first
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
*  Char -> Int
ord Char
fourth)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 4)

-- | @undoUTF32BE bytes offset@ decoded a UTF-32BE /bytes/ stream to Unicode
-- chars.
undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32BE :: ByteString -> Int -> [(Int, Char)]
undoUTF32BE bytes :: ByteString
bytes offset :: Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan 4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-32BE input contains invalid number of bytes"
  | Bool
otherwise = let first :: Char
first    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes'   = ByteString -> ByteString
BLC.tail ByteString
bytes
                    second :: Char
second   = ByteString -> Char
BLC.head ByteString
bytes'
                    bytes'' :: ByteString
bytes''  = ByteString -> ByteString
BLC.tail ByteString
bytes'
                    third :: Char
third    = ByteString -> Char
BLC.head ByteString
bytes''
                    bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
                    fourth :: Char
fourth   = ByteString -> Char
BLC.head ByteString
bytes'''
                    rest :: ByteString
rest     = ByteString -> ByteString
BLC.tail ByteString
bytes'''
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 4,
                    Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$        Char -> Int
ord Char
fourth
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 256 Int -> Int -> Int
forall a. Num a => a -> a -> a
*  Char -> Int
ord Char
first)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 4)

-- ** UTF-16 decoding

-- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a
-- single Unicode character.
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs []                          = []
combinePairs (head' :: (Int, Char)
head'@(_, head_char :: Char
head_char):tail' :: [(Int, Char)]
tail')
  | '\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xDBFF' = (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead (Int, Char)
head' [(Int, Char)]
tail'
  | '\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xDFFF' = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-16 contains trail surrogate without lead surrogate"
  | Bool
otherwise                                      = (Int, Char)
head'(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
tail'

-- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the
-- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues
-- combining surrogate pairs.
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead _lead :: (Int, Char)
_lead []                                 = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-16 contains lead surrogate as final character"
combineLead (_, lead_char :: Char
lead_char) ((trail_offset :: Int
trail_offset, trail_char :: Char
trail_char):rest :: [(Int, Char)]
rest)
  | '\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
trail_char Bool -> Bool -> Bool
&& Char
trail_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xDFFF' = (Int
trail_offset, Char -> Char -> Char
combineSurrogates Char
lead_char Char
trail_char)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
rest
  | Bool
otherwise                                        = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-16 contains lead surrogate without trail surrogate"

-- | @surrogateOffset@ is copied from the Unicode FAQs.
surrogateOffset :: Int
surrogateOffset :: Int
surrogateOffset = 0x10000 Int -> Int -> Int
forall a. Num a => a -> a -> a
- (0xD800 Int -> Int -> Int
forall a. Num a => a -> a -> a
* 1024) Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0xDC00

-- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single
-- Unicode character.
combineSurrogates :: Char -> Char -> Char
combineSurrogates :: Char -> Char -> Char
combineSurrogates lead :: Char
lead trail :: Char
trail = Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
lead Int -> Int -> Int
forall a. Num a => a -> a -> a
* 1024 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
trail Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
surrogateOffset

-- | @undoUTF18LE bytes offset@ decoded a UTF-16LE /bytes/ stream to Unicode
-- chars.
undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16LE :: ByteString -> Int -> [(Int, Char)]
undoUTF16LE bytes :: ByteString
bytes offset :: Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan 2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-16LE input contains odd number of bytes"
  | Bool
otherwise = let low :: Char
low    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
                    high :: Char
high   = ByteString -> Char
BLC.head ByteString
bytes'
                    rest :: ByteString
rest   = ByteString -> ByteString
BLC.tail ByteString
bytes'
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* 256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 2)

-- | @undoUTF18BE bytes offset@ decoded a UTF-16BE /bytes/ stream to Unicode
-- chars.
undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16BE :: ByteString -> Int -> [(Int, Char)]
undoUTF16BE bytes :: ByteString
bytes offset :: Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan 2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-16BE input contains odd number of bytes"
  | Bool
otherwise = let high :: Char
high   = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
                    low :: Char
low    = ByteString -> Char
BLC.head ByteString
bytes'
                    rest :: ByteString
rest   = ByteString -> ByteString
BLC.tail ByteString
bytes'
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* 256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 2)

-- ** UTF-8 decoding

-- | @undoUTF8 bytes offset@ decoded a UTF-8 /bytes/ stream to Unicode chars.
undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF8 :: ByteString -> Int -> [(Int, Char)]
undoUTF8 bytes :: ByteString
bytes = [Word8] -> Int -> [(Int, Char)]
undoUTF8' (ByteString -> [Word8]
BL.unpack ByteString
bytes)

w2c :: Word8 -> Char
w2c :: Word8 -> Char
w2c = Int -> Char
chr (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

w2i :: Word8 -> Int
w2i :: Word8 -> Int
w2i = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [] _ = []
undoUTF8' (first :: Word8
first:rest :: [Word8]
rest) !Int
offset
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80  = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1
    !c :: Char
c       = Word8 -> Char
w2c Word8
first
undoUTF8' (first :: Word8
first:rest :: [Word8]
rest) !Int
offset
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0xC0  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 input contains invalid first byte"
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0xE0  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8   Word8
first Int
offset [Word8]
rest
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0xF0  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 Word8
first Int
offset [Word8]
rest
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0xF8  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8  Word8
first Int
offset [Word8]
rest
  | Bool
otherwise     = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 input contains invalid first byte"

-- | @decodeTwoUTF8 first offset bytes@ decodes a two-byte UTF-8 character,
-- where the /first/ byte is already available and the second is the head of
-- the /bytes/, and then continues to undo the UTF-8 encoding.
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 first :: Word8
first offset :: Int
offset (second :: Word8
second:rest :: [Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 double byte char has invalid second byte"
  | Bool
otherwise = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 2
    !c :: Char
c       = Int -> Char
chr ((Word8 -> Int
w2i Word8
first Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0xc0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x40  Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80))
decodeTwoUTF8 _ _ [] = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 double byte char is missing second byte at eof"

-- | @decodeThreeUTF8 first offset bytes@ decodes a three-byte UTF-8 character,
-- where the /first/ byte is already available and the second and third are the
-- head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 first :: Word8
first offset :: Int
offset (second :: Word8
second:third :: Word8
third:rest :: [Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 triple byte char has invalid second byte"
  | Word8
third Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<  0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 triple byte char has invalid third byte"
  | Bool
otherwise = (Int
offset', Char
c)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 3
    !c :: Char
c       = Int -> Char
chr((Word8 -> Int
w2i Word8
first  Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0xE0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
third  Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80))
decodeThreeUTF8 _ _ _ =String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 triple byte char is missing bytes at eof"

-- | @decodeFourUTF8 first offset bytes@ decodes a four-byte UTF-8 character,
-- where the /first/ byte is already available and the second, third and fourth
-- are the head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 first :: Word8
first offset :: Int
offset (second :: Word8
second:third :: Word8
third:fourth :: Word8
fourth:rest :: [Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 quad byte char has invalid second byte"
  | Word8
third  Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 quad byte char has invalid third byte"
  | Word8
third  Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< 0x80 Bool -> Bool -> Bool
|| 0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 quad byte char has invalid fourth byte"
  | Bool
otherwise                      = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 4
    !c :: Char
c       = Int -> Char
chr((Word8 -> Int
w2i Word8
first  Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0xF0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x40000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
third  Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* 0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
fourth Int -> Int -> Int
forall a. Num a => a -> a -> a
- 0x80))

decodeFourUTF8 _ _ _ = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error "UTF-8 quad byte char is missing bytes at eof"