module Data.String.Unicode
(
Unicode,
UString,
UTF8Char,
UTF8String,
UStringWithErrors,
DecodingFct,
DecodingFctEmbedErrors,
utf8ToUnicode
, utf8ToUnicodeEmbedErrors
, latin1ToUnicode
, ucs2ToUnicode
, ucs2BigEndianToUnicode
, ucs2LittleEndianToUnicode
, utf16beToUnicode
, utf16leToUnicode
, unicodeCharToUtf8
, unicodeToUtf8
, unicodeToXmlEntity
, unicodeToLatin1
, unicodeRemoveNoneAscii
, unicodeRemoveNoneLatin1
, intToCharRef
, intToCharRefHex
, intToHexString
, getDecodingFct
, getDecodingFctEmbedErrors
, getOutputEncodingFct
, normalizeNL
, guessEncoding
, getOutputEncodingFct'
, unicodeCharToUtf8'
, unicodeCharToXmlEntity'
, unicodeCharToLatin1'
)
where
import Data.Char ( toUpper )
import Data.Char.Properties.XMLCharProps( isXml1ByteChar
, isXmlLatin1Char
)
import Data.Char.IsoLatinTables
import Data.String.UTF8Decoding ( decodeUtf8, decodeUtf8EmbedErrors )
import Data.String.EncodingNames
type Unicode = Char
type UString = [Unicode]
type UTF8Char = Char
type UTF8String = String
type DecodingFct = String -> (UString, [String])
type UStringWithErrors = [Either String Char]
type DecodingFctEmbedErrors = String -> UStringWithErrors
unicodeToUtf8 :: UString -> UTF8String
unicodeToUtf8 = concatMap unicodeCharToUtf8
unicodeCharToUtf8 :: Unicode -> UTF8String
unicodeCharToUtf8 c
| i >= 0 && i <= 0x0000007F
= [ toEnum i ]
| i >= 0x00000080 && i <= 0x000007FF
= [ toEnum (0xC0 + i `div` 0x40)
, toEnum (0x80 + i `mod` 0x40)
]
| i >= 0x00000800 && i <= 0x0000FFFF
= [ toEnum (0xE0 + i `div` 0x1000)
, toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
, toEnum (0x80 + i `mod` 0x40)
]
| i >= 0x00010000 && i <= 0x001FFFFF
= [ toEnum (0xF0 + i `div` 0x40000)
, toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
, toEnum (0x80 + i `mod` 0x40)
]
| i >= 0x00200000 && i <= 0x03FFFFFF
= [ toEnum (0xF8 + i `div` 0x1000000)
, toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
, toEnum (0x80 + i `mod` 0x40)
]
| i >= 0x04000000 && i <= 0x7FFFFFFF
= [ toEnum (0xFC + i `div` 0x40000000)
, toEnum (0x80 + (i `div` 0x1000000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)
, toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
, toEnum (0x80 + i `mod` 0x40)
]
| otherwise
= error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
where
i = fromEnum c
latin1ToUnicode :: String -> UString
latin1ToUnicode = id
latinToUnicode :: [(Char, Char)] -> String -> UString
latinToUnicode tt
= map charToUni
where
charToUni c =
foldr (\(src,dst) r ->
case compare c src of
EQ -> dst
LT -> c
GT -> r) c tt
decodeAscii :: DecodingFct
decodeAscii
= swap . partitionEither . decodeAsciiEmbedErrors
decodeAsciiEmbedErrors :: String -> UStringWithErrors
decodeAsciiEmbedErrors str
= map (\(c,pos) -> if isValid c
then Right c
else Left (toErrStr c pos)) posStr
where
posStr = zip str [(0::Int)..]
toErrStr errChr pos
= " at input position " ++ show pos ++ ": none ASCII char " ++ show errChr
isValid x = x < '\x80'
ucs2BigEndianToUnicode :: String -> UString
ucs2BigEndianToUnicode (b : l : r)
= toEnum (fromEnum b * 256 + fromEnum l) : ucs2BigEndianToUnicode r
ucs2BigEndianToUnicode []
= []
ucs2BigEndianToUnicode _
= []
ucs2LittleEndianToUnicode :: String -> UString
ucs2LittleEndianToUnicode (l : b : r)
= toEnum (fromEnum b * 256 + fromEnum l) : ucs2LittleEndianToUnicode r
ucs2LittleEndianToUnicode []
= []
ucs2LittleEndianToUnicode [_]
= []
ucs2ToUnicode :: String -> UString
ucs2ToUnicode ('\xFE':'\xFF':s)
= ucs2BigEndianToUnicode s
ucs2ToUnicode ('\xFF':'\xFE':s)
= ucs2LittleEndianToUnicode s
ucs2ToUnicode s
= ucs2BigEndianToUnicode s
utf8ToUnicode :: DecodingFct
utf8ToUnicode ('\xEF':'\xBB':'\xBF':s)
= decodeUtf8 s
utf8ToUnicode s
= decodeUtf8 s
utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors
utf8ToUnicodeEmbedErrors ('\xEF':'\xBB':'\xBF':s)
= decodeUtf8EmbedErrors s
utf8ToUnicodeEmbedErrors s
= decodeUtf8EmbedErrors s
utf16beToUnicode :: String -> UString
utf16beToUnicode ('\xFE':'\xFF':s)
= ucs2BigEndianToUnicode s
utf16beToUnicode s
= ucs2BigEndianToUnicode s
utf16leToUnicode :: String -> UString
utf16leToUnicode ('\xFF':'\xFE':s)
= ucs2LittleEndianToUnicode s
utf16leToUnicode s
= ucs2LittleEndianToUnicode s
unicodeToXmlEntity :: UString -> String
unicodeToXmlEntity
= escape isXml1ByteChar (intToCharRef . fromEnum)
unicodeToLatin1 :: UString -> String
unicodeToLatin1
= escape isXmlLatin1Char (intToCharRef . fromEnum)
escape :: (Unicode -> Bool) -> (Unicode -> String) -> UString -> String
escape check esc =
concatMap (\uc -> if check uc then [uc] else esc uc)
unicodeRemoveNoneAscii :: UString -> String
unicodeRemoveNoneAscii
= filter isXml1ByteChar
unicodeRemoveNoneLatin1 :: UString -> String
unicodeRemoveNoneLatin1
= filter isXmlLatin1Char
intToCharRef :: Int -> String
intToCharRef i
= "&#" ++ show i ++ ";"
intToCharRefHex :: Int -> String
intToCharRefHex i
= "&#x" ++ h2 ++ ";"
where
h1 = intToHexString i
h2 = if length h1 `mod` 2 == 1
then '0': h1
else h1
intToHexString :: Int -> String
intToHexString i
| i == 0
= "0"
| i > 0
= intToStr i
| otherwise
= error ("intToHexString: negative argument " ++ show i)
where
intToStr 0 = ""
intToStr i' = intToStr (i' `div` 16) ++ [fourBitsToChar (i' `mod` 16)]
fourBitsToChar :: Int -> Char
fourBitsToChar i = "0123456789ABCDEF" !! i
normalizeNL :: String -> String
normalizeNL ('\r' : '\n' : rest) = '\n' : normalizeNL rest
normalizeNL ('\r' : rest) = '\n' : normalizeNL rest
normalizeNL (c : rest) = c : normalizeNL rest
normalizeNL [] = []
decodingTable :: [(String, DecodingFct)]
decodingTable
= [ (utf8, utf8ToUnicode )
, (isoLatin1, liftDecFct latin1ToUnicode )
, (usAscii, decodeAscii )
, (ucs2, liftDecFct ucs2ToUnicode )
, (utf16, liftDecFct ucs2ToUnicode )
, (utf16be, liftDecFct utf16beToUnicode )
, (utf16le, liftDecFct utf16leToUnicode )
, (iso8859_2, liftDecFct (latinToUnicode iso_8859_2) )
, (iso8859_3, liftDecFct (latinToUnicode iso_8859_3) )
, (iso8859_4, liftDecFct (latinToUnicode iso_8859_4) )
, (iso8859_5, liftDecFct (latinToUnicode iso_8859_5) )
, (iso8859_6, liftDecFct (latinToUnicode iso_8859_6) )
, (iso8859_7, liftDecFct (latinToUnicode iso_8859_7) )
, (iso8859_8, liftDecFct (latinToUnicode iso_8859_8) )
, (iso8859_9, liftDecFct (latinToUnicode iso_8859_9) )
, (iso8859_10, liftDecFct (latinToUnicode iso_8859_10) )
, (iso8859_11, liftDecFct (latinToUnicode iso_8859_11) )
, (iso8859_13, liftDecFct (latinToUnicode iso_8859_13) )
, (iso8859_14, liftDecFct (latinToUnicode iso_8859_14) )
, (iso8859_15, liftDecFct (latinToUnicode iso_8859_15) )
, (iso8859_16, liftDecFct (latinToUnicode iso_8859_16) )
, (unicodeString, liftDecFct id )
, ("", liftDecFct id )
]
where
liftDecFct df = \ s -> (df s, [])
getDecodingFct :: String -> Maybe DecodingFct
getDecodingFct enc
= lookup (map toUpper enc) decodingTable
decodingTableEmbedErrors :: [(String, DecodingFctEmbedErrors)]
decodingTableEmbedErrors
= [ (utf8, utf8ToUnicodeEmbedErrors )
, (isoLatin1, liftDecFct latin1ToUnicode )
, (usAscii, decodeAsciiEmbedErrors )
, (ucs2, liftDecFct ucs2ToUnicode )
, (utf16, liftDecFct ucs2ToUnicode )
, (utf16be, liftDecFct utf16beToUnicode )
, (utf16le, liftDecFct utf16leToUnicode )
, (iso8859_2, liftDecFct (latinToUnicode iso_8859_2) )
, (iso8859_3, liftDecFct (latinToUnicode iso_8859_3) )
, (iso8859_4, liftDecFct (latinToUnicode iso_8859_4) )
, (iso8859_5, liftDecFct (latinToUnicode iso_8859_5) )
, (iso8859_6, liftDecFct (latinToUnicode iso_8859_6) )
, (iso8859_7, liftDecFct (latinToUnicode iso_8859_7) )
, (iso8859_8, liftDecFct (latinToUnicode iso_8859_8) )
, (iso8859_9, liftDecFct (latinToUnicode iso_8859_9) )
, (iso8859_10, liftDecFct (latinToUnicode iso_8859_10) )
, (iso8859_11, liftDecFct (latinToUnicode iso_8859_11) )
, (iso8859_13, liftDecFct (latinToUnicode iso_8859_13) )
, (iso8859_14, liftDecFct (latinToUnicode iso_8859_14) )
, (iso8859_15, liftDecFct (latinToUnicode iso_8859_15) )
, (iso8859_16, liftDecFct (latinToUnicode iso_8859_16) )
, (unicodeString, liftDecFct id )
, ("", liftDecFct id )
]
where
liftDecFct df = map Right . df
getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors
getDecodingFctEmbedErrors enc
= lookup (map toUpper enc) decodingTableEmbedErrors
outputEncodingTable :: [(String, (UString -> String))]
outputEncodingTable
= [ (utf8, unicodeToUtf8 )
, (isoLatin1, unicodeToLatin1 )
, (usAscii, unicodeToXmlEntity )
, (unicodeString, id )
, ("", unicodeToUtf8 )
]
getOutputEncodingFct :: String -> Maybe (String -> UString)
getOutputEncodingFct enc
= lookup (map toUpper enc) outputEncodingTable
guessEncoding :: String -> String
guessEncoding ('\xFF':'\xFE':'\x00':'\x00':_) = "UCS-4LE"
guessEncoding ('\xFF':'\xFE':_) = "UTF-16LE"
guessEncoding ('\xFE':'\xFF':'\x00':'\x00':_) = "UCS-4-3421"
guessEncoding ('\xFE':'\xFF':_) = "UTF-16BE"
guessEncoding ('\xEF':'\xBB':'\xBF':_) = utf8
guessEncoding ('\x00':'\x00':'\xFE':'\xFF':_) = "UCS-4BE"
guessEncoding ('\x00':'\x00':'\xFF':'\xFE':_) = "UCS-4-2143"
guessEncoding ('\x00':'\x00':'\x00':'\x3C':_) = "UCS-4BE"
guessEncoding ('\x3C':'\x00':'\x00':'\x00':_) = "UCS-4LE"
guessEncoding ('\x00':'\x00':'\x3C':'\x00':_) = "UCS-4-2143"
guessEncoding ('\x00':'\x3C':'\x00':'\x00':_) = "UCS-4-3412"
guessEncoding ('\x00':'\x3C':'\x00':'\x3F':_) = "UTF-16BE"
guessEncoding ('\x3C':'\x00':'\x3F':'\x00':_) = "UTF-16LE"
guessEncoding ('\x4C':'\x6F':'\xA7':'\x94':_) = "EBCDIC"
guessEncoding _ = ""
swap :: (a,b) -> (b,a)
swap (x,y) = (y,x)
partitionEither :: [Either a b] -> ([a], [b])
partitionEither =
foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[])
type StringFct = String -> String
outputEncodingTable' :: [(String, (Char -> StringFct))]
outputEncodingTable'
= [ (utf8, unicodeCharToUtf8' )
, (isoLatin1, unicodeCharToLatin1' )
, (usAscii, unicodeCharToXmlEntity' )
, ("", unicodeCharToUtf8' )
]
getOutputEncodingFct' :: String -> Maybe (Char -> StringFct)
getOutputEncodingFct' enc
= lookup (map toUpper enc) outputEncodingTable'
unicodeCharToUtf8' :: Char -> StringFct
unicodeCharToUtf8' c
| i >= 0 && i <= 0x0000007F
= (c :)
| i >= 0x00000080 && i <= 0x000007FF
= ((toEnum (0xC0 + i `div` 0x40) ) :) .
((toEnum (0x80 + i `mod` 0x40)) :)
| i >= 0x00000800 && i <= 0x0000FFFF
= ((toEnum (0xE0 + i `div` 0x1000) ) :) .
((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) .
((toEnum (0x80 + i `mod` 0x40)) :)
| i >= 0x00010000 && i <= 0x001FFFFF
= ((toEnum (0xF0 + i `div` 0x40000) ) :) .
((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) .
((toEnum (0x80 + i `mod` 0x40)) :)
| i >= 0x00200000 && i <= 0x03FFFFFF
= ((toEnum (0xF8 + i `div` 0x1000000) ) :) .
((toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) .
((toEnum (0x80 + i `mod` 0x40)) :)
| i >= 0x04000000 && i <= 0x7FFFFFFF
= ((toEnum (0xFC + i `div` 0x40000000) ) :) .
((toEnum (0x80 + (i `div` 0x1000000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x40000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)) :) .
((toEnum (0x80 + (i `div` 0x40) `mod` 0x40)) :) .
((toEnum (0x80 + i `mod` 0x40)) :)
| otherwise
= error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
where
i = fromEnum c
unicodeCharToXmlEntity' :: Char -> StringFct
unicodeCharToXmlEntity' c
| isXml1ByteChar c = (c :)
| otherwise = ((intToCharRef . fromEnum $ c) ++)
unicodeCharToLatin1' :: Char -> StringFct
unicodeCharToLatin1' c
| isXmlLatin1Char c = (c :)
| otherwise = ((intToCharRef . fromEnum $ c) ++)