{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE Safe #-}
module Data.YAML.Token.Encoding
( decode
, Encoding(..)
) where
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BLC
import Util
data Encoding = UTF8
| UTF16LE
| UTF16BE
| UTF32LE
| UTF32BE
instance Show Encoding where
show UTF8 = "UTF-8"
show UTF16LE = "UTF-16LE"
show UTF16BE = "UTF-16BE"
show UTF32LE = "UTF-32LE"
show UTF32BE = "UTF-32BE"
decode :: BLC.ByteString -> (Encoding, [(Int, Char)])
decode text = (encoding, undoEncoding encoding text)
where encoding = detectEncoding $ BLC.unpack $ BLC.take 4 text
detectEncoding :: [Char] -> Encoding
detectEncoding text =
case text of
'\x00' : '\x00' : '\xFE' : '\xFF' : _ -> UTF32BE
'\x00' : '\x00' : '\x00' : _ : _ -> UTF32BE
'\xFF' : '\xFE' : '\x00' : '\x00' : _ -> UTF32LE
_ : '\x00' : '\x00' : '\x00' : _ -> UTF32LE
'\xFE' : '\xFF' : _ -> UTF16BE
'\x00' : _ : _ -> UTF16BE
'\xFF' : '\xFE' : _ -> UTF16LE
_ : '\x00' : _ -> UTF16LE
'\xEF' : '\xBB' : '\xBF' : _ -> UTF8
_ -> UTF8
undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)]
undoEncoding encoding bytes =
case encoding of
UTF8 -> undoUTF8 bytes 0
UTF16LE -> combinePairs $ undoUTF16LE bytes 0
UTF16BE -> combinePairs $ undoUTF16BE bytes 0
UTF32LE -> validateScalars $ undoUTF32LE bytes 0
UTF32BE -> validateScalars $ undoUTF32BE bytes 0
where
validateScalars [] = []
validateScalars (x@(_,c):rest)
| '\xD800' <= c, c <= '\xDFFF' = error "UTF-32 stream contains invalid surrogate code-point"
| otherwise = x : validateScalars rest
hasFewerThan :: Int -> BLC.ByteString -> Bool
hasFewerThan n bytes
| n == 1 = BLC.null bytes
| n > 1 = BLC.null bytes || hasFewerThan (n - 1) (BLC.tail bytes)
| otherwise = False
undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32LE bytes offset
| BLC.null bytes = []
| hasFewerThan 4 bytes = error "UTF-32LE input contains invalid number of bytes"
| otherwise = let first = BLC.head bytes
bytes' = BLC.tail bytes
second = BLC.head bytes'
bytes'' = BLC.tail bytes'
third = BLC.head bytes''
bytes''' = BLC.tail bytes''
fourth = BLC.head bytes'''
rest = BLC.tail bytes'''
in (offset + 4,
chr $ (ord first)
+ 256 * ((ord second)
+ 256 * ((ord third)
+ 256 * ((ord fourth))))):(undoUTF32LE rest $ offset + 4)
undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32BE bytes offset
| BLC.null bytes = []
| hasFewerThan 4 bytes = error "UTF-32BE input contains invalid number of bytes"
| otherwise = let first = BLC.head bytes
bytes' = BLC.tail bytes
second = BLC.head bytes'
bytes'' = BLC.tail bytes'
third = BLC.head bytes''
bytes''' = BLC.tail bytes''
fourth = BLC.head bytes'''
rest = BLC.tail bytes'''
in (offset + 4,
chr $ (ord fourth)
+ 256 * ((ord third)
+ 256 * ((ord second)
+ 256 * ((ord first))))):(undoUTF32BE rest $ offset + 4)
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs [] = []
combinePairs (head'@(_, head_char):tail')
| '\xD800' <= head_char && head_char <= '\xDBFF' = combineLead head' tail'
| '\xDC00' <= head_char && head_char <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate"
| otherwise = head':(combinePairs tail')
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead _lead [] = error "UTF-16 contains lead surrogate as final character"
combineLead (_, lead_char) ((trail_offset, trail_char):rest)
| '\xDC00' <= trail_char && trail_char <= '\xDFFF' = (trail_offset, combineSurrogates lead_char trail_char):combinePairs rest
| otherwise = error "UTF-16 contains lead surrogate without trail surrogate"
surrogateOffset :: Int
surrogateOffset = 0x10000 - (0xD800 * 1024) - 0xDC00
combineSurrogates :: Char -> Char -> Char
combineSurrogates lead trail = chr $ (ord lead) * 1024 + (ord trail) + surrogateOffset
undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16LE bytes offset
| BLC.null bytes = []
| hasFewerThan 2 bytes = error "UTF-16LE input contains odd number of bytes"
| otherwise = let low = BLC.head bytes
bytes' = BLC.tail bytes
high = BLC.head bytes'
rest = BLC.tail bytes'
in (offset + 2, chr $ (ord low) + (ord high) * 256):(undoUTF16LE rest $ offset + 2)
undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16BE bytes offset
| BLC.null bytes = []
| hasFewerThan 2 bytes = error "UTF-16BE input contains odd number of bytes"
| otherwise = let high = BLC.head bytes
bytes' = BLC.tail bytes
low = BLC.head bytes'
rest = BLC.tail bytes'
in (offset + 2, chr $ (ord low) + (ord high) * 256):(undoUTF16BE rest $ offset + 2)
undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF8 bytes offset = undoUTF8' (BL.unpack bytes) offset
w2c :: Word8 -> Char
w2c = chr . fromIntegral
w2i :: Word8 -> Int
w2i = fromIntegral
undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [] _ = []
undoUTF8' (first:rest) !offset
| first < 0x80 = (offset', c) : undoUTF8' rest offset'
where
!offset' = offset + 1
!c = w2c first
undoUTF8' (first:rest) !offset
| first < 0xC0 = error "UTF-8 input contains invalid first byte"
| first < 0xE0 = decodeTwoUTF8 first offset rest
| first < 0xF0 = decodeThreeUTF8 first offset rest
| first < 0xF8 = decodeFourUTF8 first offset rest
| otherwise = error "UTF-8 input contains invalid first byte"
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 first offset (second:rest)
| second < 0x80 || 0xBF < second = error $ "UTF-8 double byte char has invalid second byte"
| otherwise = (offset', c) : undoUTF8' rest offset'
where
!offset' = offset + 2
!c = chr ((w2i first - 0xc0) * 0x40 + (w2i second - 0x80))
decodeTwoUTF8 _ _ [] = error "UTF-8 double byte char is missing second byte at eof"
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 first offset (second:third:rest)
| second < 0x80 || 0xBF < second = error "UTF-8 triple byte char has invalid second byte"
| third < 0x80 || 0xBF < third = error "UTF-8 triple byte char has invalid third byte"
| otherwise = (offset', c): undoUTF8' rest offset'
where
!offset' = offset + 3
!c = chr(((w2i first) - 0xE0) * 0x1000 +
((w2i second) - 0x80) * 0x40 +
((w2i third) - 0x80))
decodeThreeUTF8 _ _ _ =error "UTF-8 triple byte char is missing bytes at eof"
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 first offset (second:third:fourth:rest)
| second < 0x80 || 0xBF < second = error "UTF-8 quad byte char has invalid second byte"
| third < 0x80 || 0xBF < third = error "UTF-8 quad byte char has invalid third byte"
| third < 0x80 || 0xBF < third = error "UTF-8 quad byte char has invalid fourth byte"
| otherwise = (offset', c) : undoUTF8' rest offset'
where
!offset' = offset + 4
!c = chr(((w2i first) - 0xF0) * 0x40000 +
((w2i second) - 0x80) * 0x1000 +
((w2i third) - 0x80) * 0x40 +
((w2i fourth) - 0x80))
decodeFourUTF8 _ _ _ = error "UTF-8 quad byte char is missing bytes at eof"