Copyright | Copyright (C) 2010- Uwe Schmidt |
---|---|
License | MIT |
Maintainer | Uwe Schmidt (uwe@fh-wedel.de) |
Stability | stable |
Portability | portable |
Safe Haskell | Safe-Inferred |
Language | Haskell98 |
Unicode and UTF-8 Conversion Functions
- type Unicode = Char
- type UString = [Unicode]
- type UTF8Char = Char
- type UTF8String = String
- type UStringWithErrors = [Either String Char]
- type DecodingFct = String -> (UString, [String])
- type DecodingFctEmbedErrors = String -> UStringWithErrors
- utf8ToUnicode :: DecodingFct
- utf8ToUnicodeEmbedErrors :: DecodingFctEmbedErrors
- latin1ToUnicode :: String -> UString
- ucs2ToUnicode :: String -> UString
- ucs2BigEndianToUnicode :: String -> UString
- ucs2LittleEndianToUnicode :: String -> UString
- utf16beToUnicode :: String -> UString
- utf16leToUnicode :: String -> UString
- unicodeCharToUtf8 :: Unicode -> UTF8String
- unicodeToUtf8 :: UString -> UTF8String
- unicodeToXmlEntity :: UString -> String
- unicodeToLatin1 :: UString -> String
- unicodeRemoveNoneAscii :: UString -> String
- unicodeRemoveNoneLatin1 :: UString -> String
- intToCharRef :: Int -> String
- intToCharRefHex :: Int -> String
- intToHexString :: Int -> String
- getDecodingFct :: String -> Maybe DecodingFct
- getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors
- getOutputEncodingFct :: String -> Maybe (String -> UString)
- normalizeNL :: String -> String
- guessEncoding :: String -> String
- getOutputEncodingFct' :: String -> Maybe (Char -> StringFct)
- unicodeCharToUtf8' :: Char -> StringFct
- unicodeCharToXmlEntity' :: Char -> StringFct
- unicodeCharToLatin1' :: Char -> StringFct
Unicode Type declarations
Unicode is represented as the Char type Precondition for this is the support of Unicode character range in the compiler (e.g. ghc but not hugs)
type UTF8String = String Source
UTF-8 strings are implemented as Haskell strings
type UStringWithErrors = [Either String Char] Source
type DecodingFct = String -> (UString, [String]) Source
Decoding function with a pair containing the result string and a list of decoding errors as result
type DecodingFctEmbedErrors = String -> UStringWithErrors Source
Decoding function where decoding errors are interleaved with decoded characters
utf8ToUnicode :: DecodingFct Source
UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1
latin1ToUnicode :: String -> UString Source
code conversion from latin1 to Unicode
ucs2ToUnicode :: String -> UString Source
UCS-2 to UTF-8 conversion with byte order mark analysis
ucs2BigEndianToUnicode :: String -> UString Source
UCS-2 big endian to Unicode conversion
ucs2LittleEndianToUnicode :: String -> UString Source
UCS-2 little endian to Unicode conversion
utf16beToUnicode :: String -> UString Source
UTF-16 big endian to UTF-8 conversion with removal of byte order mark
utf16leToUnicode :: String -> UString Source
UTF-16 little endian to UTF-8 conversion with removal of byte order mark
unicodeCharToUtf8 :: Unicode -> UTF8String Source
conversion from Unicode (Char) to a UTF8 encoded string.
unicodeToUtf8 :: UString -> UTF8String Source
conversion from Unicode strings (UString) to UTF8 encoded strings.
unicodeToXmlEntity :: UString -> String Source
substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.
This function can be used to translate all text nodes and attribute values into pure ascii.
see also : unicodeToLatin1
unicodeToLatin1 :: UString -> String Source
substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.
This function can be used to translate all text nodes and attribute values into ISO latin1.
see also : unicodeToXmlEntity
unicodeRemoveNoneAscii :: UString -> String Source
removes all non ascii chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names
see also : unicodeRemoveNoneLatin1
, unicodeToXmlEntity
unicodeRemoveNoneLatin1 :: UString -> String Source
removes all non latin1 chars, may be used to transform a document into a pure ascii representation by removing all non ascii chars from tag and attibute names
see also : unicodeRemoveNoneAscii
, unicodeToLatin1
intToCharRef :: Int -> String Source
convert an Unicode into a XML character reference.
see also : intToCharRefHex
intToCharRefHex :: Int -> String Source
convert an Unicode into a XML hexadecimal character reference.
see also: intToCharRef
intToHexString :: Int -> String Source
getDecodingFct :: String -> Maybe DecodingFct Source
the lookup function for selecting the decoding function
getDecodingFctEmbedErrors :: String -> Maybe DecodingFctEmbedErrors Source
the lookup function for selecting the decoding function
getOutputEncodingFct :: String -> Maybe (String -> UString) Source
the lookup function for selecting the encoding function
normalizeNL :: String -> String Source
White Space (XML Standard 2.3) and end of line handling (2.11)
#x0D and #x0D#x0A are mapped to #x0A
guessEncoding :: String -> String Source
getOutputEncodingFct' :: String -> Maybe (Char -> StringFct) Source
the lookup function for selecting the encoding function
unicodeCharToUtf8' :: Char -> StringFct Source
conversion from Unicode (Char) to a UTF8 encoded string.
unicodeCharToXmlEntity' :: Char -> StringFct Source
substitute all Unicode characters, that are not legal 1-byte UTF-8 XML characters by a character reference.
unicodeCharToLatin1' :: Char -> StringFct Source
substitute all Unicode characters, that are not legal latin1 UTF-8 XML characters by a character reference.