{-# LANGUAGE BangPatterns #-} {-# LANGUAGE Safe #-} -- | -- Copyright: © Oren Ben-Kiki 2007, -- © Herbert Valerio Riedel 2015-2018 -- SPDX-License-Identifier: GPL-2.0-or-later -- -- UTF decoding -- -- This really should be factored out to the standard libraries. Since it isn't -- there, we get to tailor it exactly to our needs. We use lazy byte strings as -- input, which should give reasonable I\/O performance when reading large -- files. The output is a normal 'Char' list which is easy to work with and -- should be efficient enough as long as the 'Parser' does its job right. -- module Data.YAML.Token.Encoding ( decode , Encoding(..) ) where import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BLC import Util -- | Recognized Unicode encodings. As of YAML 1.2 UTF-32 is also required. data Encoding = UTF8 -- ^ UTF-8 encoding (or ASCII) | UTF16LE -- ^ UTF-16 little endian | UTF16BE -- ^ UTF-16 big endian | UTF32LE -- ^ UTF-32 little endian | UTF32BE -- ^ UTF-32 big endian -- | @show encoding@ converts an 'Encoding' to the encoding name (with a "-") -- as used by most programs. instance Show Encoding where show UTF8 = "UTF-8" show UTF16LE = "UTF-16LE" show UTF16BE = "UTF-16BE" show UTF32LE = "UTF-32LE" show UTF32BE = "UTF-32BE" -- | @decode bytes@ automatically detects the 'Encoding' used and converts the -- /bytes/ to Unicode characters, with byte offsets. Note the offset is for -- past end of the character, not its beginning. decode :: BLC.ByteString -> (Encoding, [(Int, Char)]) decode text = (encoding, undoEncoding encoding text) where encoding = detectEncoding $ BLC.unpack $ BLC.take 4 text -- | @detectEncoding text@ examines the first few chars (bytes) of the /text/ -- to deduce the Unicode encoding used according to the YAML spec. detectEncoding :: [Char] -> Encoding detectEncoding text = case text of '\x00' : '\x00' : '\xFE' : '\xFF' : _ -> UTF32BE '\x00' : '\x00' : '\x00' : _ : _ -> UTF32BE '\xFF' : '\xFE' : '\x00' : '\x00' : _ -> UTF32LE _ : '\x00' : '\x00' : '\x00' : _ -> UTF32LE '\xFE' : '\xFF' : _ -> UTF16BE '\x00' : _ : _ -> UTF16BE '\xFF' : '\xFE' : _ -> UTF16LE _ : '\x00' : _ -> UTF16LE '\xEF' : '\xBB' : '\xBF' : _ -> UTF8 _ -> UTF8 -- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode -- characters according to the /encoding/. undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)] undoEncoding encoding bytes = case encoding of UTF8 -> undoUTF8 bytes 0 UTF16LE -> combinePairs $ undoUTF16LE bytes 0 UTF16BE -> combinePairs $ undoUTF16BE bytes 0 UTF32LE -> validateScalars $ undoUTF32LE bytes 0 UTF32BE -> validateScalars $ undoUTF32BE bytes 0 where validateScalars [] = [] validateScalars (x@(_,c):rest) | '\xD800' <= c, c <= '\xDFFF' = error "UTF-32 stream contains invalid surrogate code-point" | otherwise = x : validateScalars rest -- ** UTF-32 decoding -- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/ -- left to read. hasFewerThan :: Int -> BLC.ByteString -> Bool hasFewerThan n bytes | n == 1 = BLC.null bytes | n > 1 = BLC.null bytes || hasFewerThan (n - 1) (BLC.tail bytes) | otherwise = False -- | @undoUTF32LE bytes offset@ decoded a UTF-32LE /bytes/ stream to Unicode -- chars. undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF32LE bytes offset | BLC.null bytes = [] | hasFewerThan 4 bytes = error "UTF-32LE input contains invalid number of bytes" | otherwise = let first = BLC.head bytes bytes' = BLC.tail bytes second = BLC.head bytes' bytes'' = BLC.tail bytes' third = BLC.head bytes'' bytes''' = BLC.tail bytes'' fourth = BLC.head bytes''' rest = BLC.tail bytes''' in (offset + 4, chr $ (ord first) + 256 * ((ord second) + 256 * ((ord third) + 256 * ((ord fourth))))):(undoUTF32LE rest $ offset + 4) -- | @undoUTF32BE bytes offset@ decoded a UTF-32BE /bytes/ stream to Unicode -- chars. undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF32BE bytes offset | BLC.null bytes = [] | hasFewerThan 4 bytes = error "UTF-32BE input contains invalid number of bytes" | otherwise = let first = BLC.head bytes bytes' = BLC.tail bytes second = BLC.head bytes' bytes'' = BLC.tail bytes' third = BLC.head bytes'' bytes''' = BLC.tail bytes'' fourth = BLC.head bytes''' rest = BLC.tail bytes''' in (offset + 4, chr $ (ord fourth) + 256 * ((ord third) + 256 * ((ord second) + 256 * ((ord first))))):(undoUTF32BE rest $ offset + 4) -- ** UTF-16 decoding -- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a -- single Unicode character. combinePairs :: [(Int, Char)] -> [(Int, Char)] combinePairs [] = [] combinePairs (head'@(_, head_char):tail') | '\xD800' <= head_char && head_char <= '\xDBFF' = combineLead head' tail' | '\xDC00' <= head_char && head_char <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate" | otherwise = head':(combinePairs tail') -- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the -- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues -- combining surrogate pairs. combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)] combineLead _lead [] = error "UTF-16 contains lead surrogate as final character" combineLead (_, lead_char) ((trail_offset, trail_char):rest) | '\xDC00' <= trail_char && trail_char <= '\xDFFF' = (trail_offset, combineSurrogates lead_char trail_char):combinePairs rest | otherwise = error "UTF-16 contains lead surrogate without trail surrogate" -- | @surrogateOffset@ is copied from the Unicode FAQs. surrogateOffset :: Int surrogateOffset = 0x10000 - (0xD800 * 1024) - 0xDC00 -- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single -- Unicode character. combineSurrogates :: Char -> Char -> Char combineSurrogates lead trail = chr $ (ord lead) * 1024 + (ord trail) + surrogateOffset -- | @undoUTF18LE bytes offset@ decoded a UTF-16LE /bytes/ stream to Unicode -- chars. undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF16LE bytes offset | BLC.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16LE input contains odd number of bytes" | otherwise = let low = BLC.head bytes bytes' = BLC.tail bytes high = BLC.head bytes' rest = BLC.tail bytes' in (offset + 2, chr $ (ord low) + (ord high) * 256):(undoUTF16LE rest $ offset + 2) -- | @undoUTF18BE bytes offset@ decoded a UTF-16BE /bytes/ stream to Unicode -- chars. undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF16BE bytes offset | BLC.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16BE input contains odd number of bytes" | otherwise = let high = BLC.head bytes bytes' = BLC.tail bytes low = BLC.head bytes' rest = BLC.tail bytes' in (offset + 2, chr $ (ord low) + (ord high) * 256):(undoUTF16BE rest $ offset + 2) -- ** UTF-8 decoding -- | @undoUTF8 bytes offset@ decoded a UTF-8 /bytes/ stream to Unicode chars. undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF8 bytes offset = undoUTF8' (BL.unpack bytes) offset w2c :: Word8 -> Char w2c = chr . fromIntegral w2i :: Word8 -> Int w2i = fromIntegral undoUTF8' :: [Word8] -> Int -> [(Int, Char)] undoUTF8' [] _ = [] undoUTF8' (first:rest) !offset | first < 0x80 = (offset', c) : undoUTF8' rest offset' where !offset' = offset + 1 !c = w2c first undoUTF8' (first:rest) !offset | first < 0xC0 = error "UTF-8 input contains invalid first byte" | first < 0xE0 = decodeTwoUTF8 first offset rest | first < 0xF0 = decodeThreeUTF8 first offset rest | first < 0xF8 = decodeFourUTF8 first offset rest | otherwise = error "UTF-8 input contains invalid first byte" -- | @decodeTwoUTF8 first offset bytes@ decodes a two-byte UTF-8 character, -- where the /first/ byte is already available and the second is the head of -- the /bytes/, and then continues to undo the UTF-8 encoding. decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)] decodeTwoUTF8 first offset (second:rest) | second < 0x80 || 0xBF < second = error $ "UTF-8 double byte char has invalid second byte" | otherwise = (offset', c) : undoUTF8' rest offset' where !offset' = offset + 2 !c = chr ((w2i first - 0xc0) * 0x40 + (w2i second - 0x80)) decodeTwoUTF8 _ _ [] = error "UTF-8 double byte char is missing second byte at eof" -- | @decodeThreeUTF8 first offset bytes@ decodes a three-byte UTF-8 character, -- where the /first/ byte is already available and the second and third are the -- head of the /bytes/, and then continues to undo the UTF-8 encoding. decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)] decodeThreeUTF8 first offset (second:third:rest) | second < 0x80 || 0xBF < second = error "UTF-8 triple byte char has invalid second byte" | third < 0x80 || 0xBF < third = error "UTF-8 triple byte char has invalid third byte" | otherwise = (offset', c): undoUTF8' rest offset' where !offset' = offset + 3 !c = chr(((w2i first) - 0xE0) * 0x1000 + ((w2i second) - 0x80) * 0x40 + ((w2i third) - 0x80)) decodeThreeUTF8 _ _ _ =error "UTF-8 triple byte char is missing bytes at eof" -- | @decodeFourUTF8 first offset bytes@ decodes a four-byte UTF-8 character, -- where the /first/ byte is already available and the second, third and fourth -- are the head of the /bytes/, and then continues to undo the UTF-8 encoding. decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)] decodeFourUTF8 first offset (second:third:fourth:rest) | second < 0x80 || 0xBF < second = error "UTF-8 quad byte char has invalid second byte" | third < 0x80 || 0xBF < third = error "UTF-8 quad byte char has invalid third byte" | third < 0x80 || 0xBF < third = error "UTF-8 quad byte char has invalid fourth byte" | otherwise = (offset', c) : undoUTF8' rest offset' where !offset' = offset + 4 !c = chr(((w2i first) - 0xF0) * 0x40000 + ((w2i second) - 0x80) * 0x1000 + ((w2i third) - 0x80) * 0x40 + ((w2i fourth) - 0x80)) decodeFourUTF8 _ _ _ = error "UTF-8 quad byte char is missing bytes at eof"