Copyright	(c) Dong Han 2017-2018
License	BSD
Maintainer	winterland1989@gmail.com
Stability	experimental
Portability	non-portable
Safe Haskell	None
Language	Haskell2010

Z.Data.Text.Base

Contents

Text type
Building text
Basic creating
Conversion between list
Conversion between codepoint vector
Basic interface
Re-exports
Constants
- Locale
- Category
Misc

Description

A Text wrap a Bytes which will be interpreted using UTF-8 encoding. User should always use validate to construt a Text (instead of using construtor directly or coercing), otherwise illegal UTF-8 encoded codepoints will cause undefined behaviours.

Synopsis

newtype Text = Text {
- getUTF8Bytes :: Bytes
}
validate :: HasCallStack => Bytes -> Text
validateASCII :: HasCallStack => Bytes -> Text
validateMaybe :: Bytes -> Maybe Text
validateASCIIMaybe :: Bytes -> Maybe Text
index :: HasCallStack => Text -> Int -> Char
indexMaybe :: Text -> Int -> Maybe Char
charByteIndex :: Text -> Int -> Int
indexR :: HasCallStack => Text -> Int -> Char
indexMaybeR :: Text -> Int -> Maybe Char
charByteIndexR :: Text -> Int -> Int
empty :: Text
singleton :: Char -> Text
copy :: Text -> Text
replicate :: Int -> Char -> Text
cycleN :: Int -> Text -> Text
pack :: String -> Text
packN :: Int -> String -> Text
packR :: String -> Text
packRN :: Int -> String -> Text
unpack :: Text -> String
unpackR :: Text -> String
fromVector :: PrimVector Char -> Text
toVector :: Text -> PrimVector Char
null :: Text -> Bool
length :: Text -> Int
append :: Text -> Text -> Text
map' :: (Char -> Char) -> Text -> Text
imap' :: (Int -> Char -> Char) -> Text -> Text
foldl' :: (b -> Char -> b) -> b -> Text -> b
ifoldl' :: (b -> Int -> Char -> b) -> b -> Text -> b
foldr' :: (Char -> b -> b) -> b -> Text -> b
ifoldr' :: (Int -> Char -> b -> b) -> b -> Text -> b
concat :: [Text] -> Text
concatMap :: (Char -> Text) -> Text -> Text
count :: Char -> Text -> Int
all :: (Char -> Bool) -> Text -> Bool
any :: (Char -> Bool) -> Text -> Bool
displayWidth :: Text -> Int
displayWidthChar :: Char -> Int
data NormalizationResult
- = NormalizedYes
- | NormalizedMaybe
- | NormalizedNo
data NormalizeMode
- = NFC
- | NFKC
- | NFD
- | NFKD
isNormalized :: Text -> NormalizationResult
isNormalizedTo :: NormalizeMode -> Text -> NormalizationResult
normalize :: Text -> Text
normalizeTo :: NormalizeMode -> Text -> Text
envLocale :: IO Locale
caseFold :: Text -> Text
caseFoldWith :: Locale -> Text -> Text
toLower :: Text -> Text
toLowerWith :: Locale -> Text -> Text
toUpper :: Text -> Text
toUpperWith :: Locale -> Text -> Text
toTitle :: Text -> Text
toTitleWith :: Locale -> Text -> Text
isCategory :: Category -> Text -> Bool
spanCategory :: Category -> Text -> (Text, Text)
collate :: Collator -> Text -> Text -> Ordering
data Collator
collatorFor :: Lang -> Collator
collator :: QuasiQuoter
setUpperBeforeLower :: Bool -> Collator -> Collator
setFrenchAccents :: Bool -> Collator -> Collator
setNormalization :: Bool -> Collator -> Collator
setVariableWeighting :: VariableWeighting -> Collator -> Collator
collatorLang :: Collator -> Maybe Lang
rootCollator :: Collator
renderSortKey :: SortKey -> String
data VariableWeighting
- = NonIgnorable
- | Blanked
- | Shifted
- | ShiftTrimmed
data CollatorOptions = CollatorOptions {
- optLang :: Maybe Lang
- optVariableWeighting :: VariableWeighting
- optFrenchAccents :: Bool
- optUpperBeforeLower :: Bool
- optNormalize :: Bool
}
newtype SortKey = SortKey [Word16]
data Collator
tailorings :: [(Lang, Collation)]
parseLang :: Text -> Either String Lang
renderLang :: Lang -> Text
lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a)
data Lang = Lang {
- langLanguage :: Text
- langScript :: Maybe Text
- langRegion :: Maybe Text
- langVariants :: [Text]
- langExtensions :: [(Text, [(Text, Text)])]
- langPrivateUse :: [Text]
}
type Locale = CSize
pattern LocaleDefault :: Locale
pattern LocaleLithuanian :: Locale
pattern LocaleTurkishAndAzeriLatin :: Locale
type Category = CSize
pattern CategoryLetterUppercase :: Category
pattern CategoryLetterLowercase :: Category
pattern CategoryLetterTitlecase :: Category
pattern CategoryLetterOther :: Category
pattern CategoryLetter :: Category
pattern CategoryCaseMapped :: Category
pattern CategoryMarkNonSpacing :: Category
pattern CategoryMarkSpacing :: Category
pattern CategoryMarkEnclosing :: Category
pattern CategoryMark :: Category
pattern CategoryNumberDecimal :: Category
pattern CategoryNumberLetter :: Category
pattern CategoryNumberOther :: Category
pattern CategoryNumber :: Category
pattern CategoryPunctuationConnector :: Category
pattern CategoryPunctuationDash :: Category
pattern CategoryPunctuationOpen :: Category
pattern CategoryPunctuationClose :: Category
pattern CategoryPunctuationInitial :: Category
pattern CategoryPunctuationFinal :: Category
pattern CategoryPunctuationOther :: Category
pattern CategoryPunctuation :: Category
pattern CategorySymbolMath :: Category
pattern CategorySymbolCurrency :: Category
pattern CategorySymbolModifier :: Category
pattern CategorySymbolOther :: Category
pattern CategorySymbol :: Category
pattern CategorySeparatorSpace :: Category
pattern CategorySeparatorLine :: Category
pattern CategorySeparatorParagraph :: Category
pattern CategorySeparator :: Category
pattern CategoryControl :: Category
pattern CategoryFormat :: Category
pattern CategorySurrogate :: Category
pattern CategoryPrivateUse :: Category
pattern CategoryUnassigned :: Category
pattern CategoryCompatibility :: Category
pattern CategoryIgnoreGraphemeCluster :: Category
pattern CategoryIscntrl :: Category
pattern CategoryIsprint :: Category
pattern CategoryIsspace :: Category
pattern CategoryIsblank :: Category
pattern CategoryIsgraph :: Category
pattern CategoryIspunct :: Category
pattern CategoryIsalnum :: Category
pattern CategoryIsalpha :: Category
pattern CategoryIsupper :: Category
pattern CategoryIslower :: Category
pattern CategoryIsdigit :: Category
pattern CategoryIsxdigit :: Category
data TextException
- = InvalidUTF8Exception CallStack
- | InvalidASCIIException CallStack
- | IndexOutOfTextRange Int CallStack
- | EmptyText CallStack
errorEmptyText :: HasCallStack => a
c_utf8_validate_ba :: ByteArray# -> Int# -> Int# -> Int
c_utf8_validate_addr :: Addr# -> Int -> IO Int
c_ascii_validate_ba :: ByteArray# -> Int# -> Int# -> Int
c_ascii_validate_addr :: Addr# -> Int -> IO Int

Text type

newtype Text Source #

Text represented as UTF-8 encoded Bytes

Constructors

Text
Fields getUTF8Bytes :: Bytes Extract UTF-8 encoded `Bytes` from `Text`

Instances

Instances details

IsList Text Source #
Instance details Defined in Z.Data.Text.Base Associated Types type Item Text # Methods fromList :: [Item Text] -> Text # fromListN :: Int -> [Item Text] -> Text # toList :: Text -> [Item Text] #
Eq Text Source #
Instance details Defined in Z.Data.Text.Base Methods (==) :: Text -> Text -> Bool # (/=) :: Text -> Text -> Bool #
Ord Text Source #
Instance details Defined in Z.Data.Text.Base Methods compare :: Text -> Text -> Ordering # (<) :: Text -> Text -> Bool # (<=) :: Text -> Text -> Bool # (>) :: Text -> Text -> Bool # (>=) :: Text -> Text -> Bool # max :: Text -> Text -> Text # min :: Text -> Text -> Text #
Read Text Source #	Accepted syntax and escaping rules are same with `String`, which is different from `Show` instance.
Instance details Defined in Z.Data.Text.Base Methods readsPrec :: Int -> ReadS Text # readList :: ReadS [Text] # readPrec :: ReadPrec Text # readListPrec :: ReadPrec [Text] #
Show Text Source #	The escaping rules is different from `String` 's `Show` instance, see "Z.Data.Text.Builder.escapeTextJSON"
Instance details Defined in Z.Data.Text.Base Methods showsPrec :: Int -> Text -> ShowS # show :: Text -> String # showList :: [Text] -> ShowS #
IsString Text Source #
Instance details Defined in Z.Data.Text.Base Methods fromString :: String -> Text #
Semigroup Text Source #
Instance details Defined in Z.Data.Text.Base Methods (<>) :: Text -> Text -> Text # sconcat :: NonEmpty Text -> Text # stimes :: Integral b => b -> Text -> Text #
Monoid Text Source #
Instance details Defined in Z.Data.Text.Base Methods mempty :: Text # mappend :: Text -> Text -> Text # mconcat :: [Text] -> Text #
Arbitrary Text Source #
Instance details Defined in Z.Data.Text.Base Methods arbitrary :: Gen Text # shrink :: Text -> [Text] #
CoArbitrary Text Source #
Instance details Defined in Z.Data.Text.Base Methods coarbitrary :: Text -> Gen b -> Gen b #
FoldCase Text Source #	case fold with default locale.
Instance details Defined in Z.Data.Text.Base Methods foldCase :: Text -> Text # foldCaseList :: [Text] -> [Text]
NFData Text Source #
Instance details Defined in Z.Data.Text.Base Methods rnf :: Text -> () #
Hashable Text Source #
Instance details Defined in Z.Data.Text.Base Methods hashWithSalt :: Int -> Text -> Int # hash :: Text -> Int #
Print Text Source #	The escaping rules is same with `Show` instance: we reuse JSON escaping rules here, so it will be faster.
Instance details Defined in Z.Data.Text.Print Methods toUTF8BuilderP :: Int -> Text -> Builder () Source #
JSON Text Source #
Instance details Defined in Z.Data.JSON.Base Methods fromValue :: Value -> Converter Text Source # toValue :: Text -> Value Source # encodeJSON :: Text -> Builder () Source #
JSON a => JSON (Map Text a) Source #
Instance details Defined in Z.Data.JSON.Base Methods fromValue :: Value -> Converter (Map Text a) Source # toValue :: Map Text a -> Value Source # encodeJSON :: Map Text a -> Builder () Source #
JSON a => JSON (HashMap Text a) Source #	default instance prefer later key
Instance details Defined in Z.Data.JSON.Base Methods fromValue :: Value -> Converter (HashMap Text a) Source # toValue :: HashMap Text a -> Value Source # encodeJSON :: HashMap Text a -> Builder () Source #
JSON a => JSON (FlatMap Text a) Source #	default instance prefer later key
Instance details Defined in Z.Data.JSON.Base Methods fromValue :: Value -> Converter (FlatMap Text a) Source # toValue :: FlatMap Text a -> Value Source # encodeJSON :: FlatMap Text a -> Builder () Source #
type Item Text Source #
Instance details Defined in Z.Data.Text.Base type Item Text = Char

Building text

validate :: HasCallStack => Bytes -> Text Source #

O(n) Validate a sequence of bytes is UTF-8 encoded.

Throw InvalidUTF8Exception in case of invalid codepoint.

validateASCII :: HasCallStack => Bytes -> Text Source #

O(n) Validate a sequence of bytes is all ascii char byte(<128).

Throw InvalidASCIIException in case of invalid byte, It's not always faster than validate, use it only if you want to validate ASCII char sequences.

validateMaybe :: Bytes -> Maybe Text Source #

O(n) Validate a sequence of bytes is UTF-8 encoded.

Return Nothing in case of invalid codepoint.

validateASCIIMaybe :: Bytes -> Maybe Text Source #

O(n) Validate a sequence of bytes is all ascii char byte(<128).

Return Nothing in case of invalid byte.

index :: HasCallStack => Text -> Int -> Char Source #

O(n) Get the nth codepoint from Text, throw IndexOutOfTextRange when out of bound.

indexMaybe :: Text -> Int -> Maybe Char Source #

O(n) Get the nth codepoint from Text.

charByteIndex :: Text -> Int -> Int Source #

O(n) Find the nth codepoint's byte index (pointing to the nth char's begining byte).

The index is only meaningful to the whole byte slice, if there's less than n codepoints, the index will point to next byte after the end.

indexR :: HasCallStack => Text -> Int -> Char Source #

O(n) Get the nth codepoint from Text counting from the end, throw IndexOutOfVectorRange n callStack when out of bound.

indexMaybeR :: Text -> Int -> Maybe Char Source #

O(n) Get the nth codepoint from Text counting from the end.

charByteIndexR :: Text -> Int -> Int Source #

O(n) Find the nth codepoint's byte index from the end (pointing to the previous char's ending byte).

The index is only meaningful to the whole byte slice, if there's less than n codepoints, the index will point to previous byte before the start.

Basic creating

empty :: Text Source #

O(1). Empty text.

singleton :: Char -> Text Source #

O(1). Single char text.

copy :: Text -> Text Source #

O(n). Copy a text from slice.

replicate :: Int -> Char -> Text Source #

O(n) replicate char n time.

cycleN :: Int -> Text -> Text Source #

O(n*m) cycleN a text n times.

Conversion between list

pack :: String -> Text Source #

O(n) Convert a string into a text

Alias for packN defaultInitSize, will be rewritten to a memcpy if possible.

packN :: Int -> String -> Text Source #

O(n) Convert a list into a text with an approximate size(in bytes, not codepoints).

If the encoded bytes length is larger than the size given, we simply double the buffer size and continue building.

This function is a good consumer in the sense of build/foldr fusion.

packR :: String -> Text Source #

O(n) Alias for packRN defaultInitSize.

packRN :: Int -> String -> Text Source #

O(n) packN in reverse order.

This function is a good consumer in the sense of build/foldr fusion.

unpack :: Text -> String Source #

O(n) Convert text to a char list.

Unpacking is done lazily. i.e. we will retain reference to the array until all element are consumed.

This function is a good producer in the sense of build/foldr fusion.

unpackR :: Text -> String Source #

O(n) Convert text to a list in reverse order.

This function is a good producer in the sense of build/foldr fusion.

Conversion between codepoint vector

fromVector :: PrimVector Char -> Text Source #

O(n) convert from a char vector.

toVector :: Text -> PrimVector Char Source #

O(n) convert to a char vector.

Basic interface

null :: Text -> Bool Source #

O(1) Test whether a text is empty.

length :: Text -> Int Source #

O(n) The char length of a text.

append :: Text -> Text -> Text Source #

O(m+n)

There's no need to guard empty vector because we guard them for you, so appending empty text are no-ops.

map' :: (Char -> Char) -> Text -> Text Source #

O(n) map f t is the Text obtained by applying f to each char of t. Performs replacement on invalid scalar values.

imap' :: (Int -> Char -> Char) -> Text -> Text Source #

Strict mapping with index.

foldl' :: (b -> Char -> b) -> b -> Text -> b Source #

Strict left to right fold.

ifoldl' :: (b -> Int -> Char -> b) -> b -> Text -> b Source #

Strict left to right fold with index.

foldr' :: (Char -> b -> b) -> b -> Text -> b Source #

Strict right to left fold

ifoldr' :: (Int -> Char -> b -> b) -> b -> Text -> b Source #

Strict right to left fold with index

NOTE: the index is counting from 0, not backwards

concat :: [Text] -> Text Source #

O(n) Concatenate a list of text.

Note: concat have to force the entire list to filter out empty text and calculate the length for allocation.

concatMap :: (Char -> Text) -> Text -> Text Source #

Map a function over a text and concatenate the results

Special folds

count :: Char -> Text -> Int Source #

O(n) count returns count of an element from a text.

all :: (Char -> Bool) -> Text -> Bool Source #

O(n) Applied to a predicate and text, all determines if all chars of the text satisfy the predicate.

any :: (Char -> Bool) -> Text -> Bool Source #

O(n) Applied to a predicate and a text, any determines if any chars of the text satisfy the predicate.

Text display width

displayWidth :: Text -> Int Source #

Get the display width of a piece of text.

You shouldn't pass texts with control characters(<0x20, \DEL), which are counted with -1 width.

>>> displayWidth "你好世界！"
>>> 10
>>> displayWidth "hello world!"
>>> 12

displayWidthChar :: Char -> Int Source #

Get the display width of a Char.

You shouldn't pass texts with control characters(<0x20, \DEL), which are counted with -1 width.

normalization

data NormalizationResult Source #

Constructors

NormalizedYes
NormalizedMaybe
NormalizedNo

Instances

Instances details

Eq NormalizationResult Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods (==) :: NormalizationResult -> NormalizationResult -> Bool # (/=) :: NormalizationResult -> NormalizationResult -> Bool #
Ord NormalizationResult Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods compare :: NormalizationResult -> NormalizationResult -> Ordering # (<) :: NormalizationResult -> NormalizationResult -> Bool # (<=) :: NormalizationResult -> NormalizationResult -> Bool # (>) :: NormalizationResult -> NormalizationResult -> Bool # (>=) :: NormalizationResult -> NormalizationResult -> Bool # max :: NormalizationResult -> NormalizationResult -> NormalizationResult # min :: NormalizationResult -> NormalizationResult -> NormalizationResult #
Show NormalizationResult Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods showsPrec :: Int -> NormalizationResult -> ShowS # show :: NormalizationResult -> String # showList :: [NormalizationResult] -> ShowS #
Generic NormalizationResult Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Associated Types type Rep NormalizationResult :: Type -> Type # Methods from :: NormalizationResult -> Rep NormalizationResult x # to :: Rep NormalizationResult x -> NormalizationResult #
type Rep NormalizationResult Source #
Instance details Defined in Z.Data.Text.UTF8Rewind type Rep NormalizationResult = D1 ('MetaData "NormalizationResult" "Z.Data.Text.UTF8Rewind" "Z-Data-0.9.0.0-9CZLncR3XfBEnMnIuwgUs7" 'False) (C1 ('MetaCons "NormalizedYes" 'PrefixI 'False) (U1 :: Type -> Type) :+: (C1 ('MetaCons "NormalizedMaybe" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "NormalizedNo" 'PrefixI 'False) (U1 :: Type -> Type)))

data NormalizeMode Source #

These are the Unicode Normalization Forms:

Form                         | Description
---------------------------- | ---------------------------------------------
Normalization Form D (NFD)   | Canonical decomposition
Normalization Form C (NFC)   | Canonical decomposition, followed by canonical composition
Normalization Form KD (NFKD) | Compatibility decomposition
Normalization Form KC (NFKC) | Compatibility decomposition, followed by canonical composition

Constructors

NFC
NFKC
NFD
NFKD

Instances

Instances details

Eq NormalizeMode Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods (==) :: NormalizeMode -> NormalizeMode -> Bool # (/=) :: NormalizeMode -> NormalizeMode -> Bool #
Ord NormalizeMode Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods compare :: NormalizeMode -> NormalizeMode -> Ordering # (<) :: NormalizeMode -> NormalizeMode -> Bool # (<=) :: NormalizeMode -> NormalizeMode -> Bool # (>) :: NormalizeMode -> NormalizeMode -> Bool # (>=) :: NormalizeMode -> NormalizeMode -> Bool # max :: NormalizeMode -> NormalizeMode -> NormalizeMode # min :: NormalizeMode -> NormalizeMode -> NormalizeMode #
Show NormalizeMode Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Methods showsPrec :: Int -> NormalizeMode -> ShowS # show :: NormalizeMode -> String # showList :: [NormalizeMode] -> ShowS #
Generic NormalizeMode Source #
Instance details Defined in Z.Data.Text.UTF8Rewind Associated Types type Rep NormalizeMode :: Type -> Type # Methods from :: NormalizeMode -> Rep NormalizeMode x # to :: Rep NormalizeMode x -> NormalizeMode #
type Rep NormalizeMode Source #
Instance details Defined in Z.Data.Text.UTF8Rewind type Rep NormalizeMode = D1 ('MetaData "NormalizeMode" "Z.Data.Text.UTF8Rewind" "Z-Data-0.9.0.0-9CZLncR3XfBEnMnIuwgUs7" 'False) ((C1 ('MetaCons "NFC" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "NFKC" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "NFD" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "NFKD" 'PrefixI 'False) (U1 :: Type -> Type)))

isNormalized :: Text -> NormalizationResult Source #

Check if a string is stable in the NFC (Normalization Form C).

isNormalizedTo :: NormalizeMode -> Text -> NormalizationResult Source #

Check if a string is stable in the specified Unicode Normalization Form.

This function can be used as a preprocessing step, before attempting to normalize a string. Normalization is a very expensive process, it is often cheaper to first determine if the string is unstable in the requested normalization form.

The result of the check will be YES if the string is stable and MAYBE or NO if it is unstable. If the result is MAYBE, the string does not necessarily have to be normalized.

For more information, please review <http://www.unicode.org/reports/tr15/ Unicode Standard Annex #15 - Unicode Normalization Forms>.

normalize :: Text -> Text Source #

Normalize a string to NFC (Normalization Form C).

normalizeTo :: NormalizeMode -> Text -> Text Source #

Normalize a string to the specified Unicode Normalization Form.

The Unicode standard defines two standards for equivalence between characters: canonical and compatibility equivalence. Canonically equivalent characters and sequence represent the same abstract character and must be rendered with the same appearance and behavior. Compatibility equivalent characters have a weaker equivalence and may be rendered differently.

Unicode Normalization Forms are formally defined standards that can be used to test whether any two strings of characters are equivalent to each other. This equivalence may be canonical or compatibility.

The algorithm puts all combining marks into a specified order and uses the rules for decomposition and composition to transform the string into one of four Unicode Normalization Forms. A binary comparison can then be used to determine equivalence.

Case conversion

envLocale :: IO Locale Source #

Get environment locale

caseFold :: Text -> Text Source #

Remove case distinction from UTF-8 encoded text with default locale.

caseFoldWith :: Locale -> Text -> Text Source #

Remove case distinction from UTF-8 encoded text.

Case folding is the process of eliminating differences between code points concerning case mapping. It is most commonly used for comparing strings in a case-insensitive manner. Conversion is fully compliant with the Unicode 7.0 standard.

Although similar to lowercasing text, there are significant differences. For one, case folding does _not_ take locale into account when converting. In some cases, case folding can be up to 20% faster than lowercasing the same text, but the result cannot be treated as correct lowercased text.

Only two locale-specific exception are made when case folding text. In Turkish, U+0049 LATIN CAPITAL LETTER I maps to U+0131 LATIN SMALL LETTER DOTLESS I and U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE maps to U+0069 LATIN SMALL LETTER I.

Although most code points can be case folded without changing length, there are notable exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE) maps to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT ABOVE) when converted to lowercase.

Only a handful of scripts make a distinction between upper- and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.

toLower :: Text -> Text Source #

Convert UTF-8 encoded text to lowercase with default locale.

toLowerWith :: Locale -> Text -> Text Source #

Convert UTF-8 encoded text to lowercase.

This function allows conversion of UTF-8 encoded strings to lowercase without first changing the encoding to UTF-32. Conversion is fully compliant with the Unicode 7.0 standard.

Although most code points can be converted to lowercase with changing length, there are notable exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE) maps to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT ABOVE) when converted to lowercase.

Only a handful of scripts make a distinction between upper- and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.

Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x)).

Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt

toUpper :: Text -> Text Source #

Convert UTF-8 encoded text to uppercase with default locale.

toUpperWith :: Locale -> Text -> Text Source #

Convert UTF-8 encoded text to uppercase.

Conversion is fully compliant with the Unicode 7.0 standard.

Although most code points can be converted without changing length, there are notable exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to "U+0053 U+0053" (LATIN CAPITAL LETTER S and LATIN CAPITAL LETTER S) when converted to uppercase.

Only a handful of scripts make a distinction between upper and lowercase. In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic, a few historic or archaic scripts have case. The vast majority of scripts do not have case distinctions.

Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x)).

Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt

toTitle :: Text -> Text Source #

Convert UTF-8 encoded text to titlecase with default locale.

toTitleWith :: Locale -> Text -> Text Source #

Convert UTF-8 encoded text to titlecase.

This function allows conversion of UTF-8 encoded strings to titlecase. Conversion is fully compliant with the Unicode 7.0 standard.

Titlecase requires a bit more explanation than uppercase and lowercase, because it is not a common text transformation. Titlecase uses uppercase for the first letter of each word and lowercase for the rest. Words are defined as "collections of code points with general category Lu, Ll, Lt, Lm or Lo according to the Unicode database".

Effectively, any type of punctuation can break up a word, even if this is not grammatically valid. This happens because the titlecasing algorithm does not and cannot take grammar rules into account.

Text                                 | Titlecase
-------------------------------------|-------------------------------------
The running man                      | The Running Man
NATO Alliance                        | Nato Alliance
You're amazing at building libraries | You'Re Amazing At Building Libraries

Although most code points can be converted to titlecase without changing length, there are notable exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to "U+0053 U+0073" (LATIN CAPITAL LETTER S and LATIN SMALL LETTER S) when converted to titlecase.

Certain code points (or combinations of code points) apply rules based on the locale. For more information about these exceptional code points, please refer to the Unicode standard: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt

Unicode category

isCategory :: Category -> Text -> Bool Source #

Check if the input string conforms to the category specified by the flags.

This function can be used to check if the code points in a string are part of a category. Valid flags are members of the "list of categories". The category for a code point is defined as part of the entry in UnicodeData.txt, the data file for the Unicode code point database.

By default, the function will treat grapheme clusters as a single code point. This means that the following string:

Code point | Canonical combining class | General category      | Name
---------- | ------------------------- | --------------------- | ----------------------
U+0045     | 0                         | Lu (Uppercase letter) | LATIN CAPITAL LETTER E
U+0300     | 230                       | Mn (Non-spacing mark) | COMBINING GRAVE ACCENT

Will match with CategoryLetterUppercase in its entirety, because the COMBINING GRAVE ACCENT is treated as part of the grapheme cluster. This is useful when e.g. creating a text parser, because you do not have to normalize the text first.

If this is undesired behavior, specify the CategoryIgnoreGraphemeCluster flag.

In order to maintain backwards compatibility with POSIX functions like isdigit and isspace, compatibility flags have been provided. Note, however, that the result is only guaranteed to be correct for code points in the Basic Latin range, between U+0000 and 0+007F. Combining a compatibility flag with a regular category flag will result in undefined behavior.

spanCategory :: Category -> Text -> (Text, Text) Source #

Try to match as many code points with the matching category flags as possible and return the prefix and suffix.

Collate

collate :: Collator -> Text -> Text -> Ordering Source #

Compare two Texts with Unicode Collation Algorithm

data Collator #

Instances

Instances details

IsString Collator
Instance details Defined in Text.Collate.Collator Methods fromString :: String -> Collator #

Re-exports

collatorFor :: Lang -> Collator #

Returns a collator based on a BCP 47 language tag. If no exact match is found, we try to find the best match (falling back to the root collation if nothing else succeeds). If something other than the default collation for a language is desired, the co keyword of the unicode extensions can be used (e.g. es-u-co-trad for traditional Spanish). Other unicode extensions affect the collator options:

The kb keyword has the same effect as setFrenchAccents (e.g. fr-FR-u-kb-true).
The ka keyword has the same effect as setVariableWeight (e.g. fr-FR-u-kb-ka-shifted or en-u-ka-noignore).
The kf keyword has the same effect as setUpperBeforeLower (e.g. fr-u-kf-upper or fr-u-kf-lower).
The kk keyword has the same effect as setNormalization (e.g. fr-u-kk-false).

collator :: QuasiQuoter #

Create a collator at compile time based on a BCP 47 language tag: e.g., [collator|es-u-co-trad|]. Requires the QuasiQuotes extension.

setUpperBeforeLower :: Bool -> Collator -> Collator #

Most collations default to sorting lowercase letters before uppercase (exceptions: mt, da, cu). To select the opposite behavior, use setUpperBeforeLower True.

setFrenchAccents :: Bool -> Collator -> Collator #

setFrenchAccents True causes secondary weights to be scanned in reverse order, so we get the sorting cote côte coté côté instead of cote coté côte côté. The default is usually False, except for fr-CA where it is True.

setNormalization :: Bool -> Collator -> Collator #

The Unicode Collation Algorithm expects input to be normalized into its canonical decomposition (NFD). By default, collators perform this normalization. If your input is already normalized, you can increase performance by disabling this step: setNormalization False.

setVariableWeighting :: VariableWeighting -> Collator -> Collator #

Set method for handling variable elements (punctuation and spaces): see http://www.unicode.org/reports/tr10/, Tables 11 and 12.

collatorLang :: Collator -> Maybe Lang #

Lang used for tailoring. Because of fallback rules, this may be somewhat different from the Lang passed to collatorFor. This Lang won't contain unicode extensions used to set options, but it will specify the collation if a non-default collation is being used.

rootCollator :: Collator #

Default collator based on DUCET table (allkeys.txt).

renderSortKey :: SortKey -> String #

Render sort key in the manner used in the CLDR collation test data: the character '|' is used to separate the levels of the key and corresponds to a 0 in the actual sort key.

data VariableWeighting #

VariableWeighting affects how punctuation is treated. See http://www.unicode.org/reports/tr10/#Variable_Weighting.

Constructors

NonIgnorable	Don't ignore punctuation (Deluge < deluge-)
Blanked	Completely ignore punctuation (Deluge = deluge-)
Shifted	Consider punctuation at lower priority (de-luge < delu-ge < deluge < deluge- < Deluge)
ShiftTrimmed	Variant of Shifted (deluge < de-luge < delu-ge)

Instances

Instances details

Eq VariableWeighting
Instance details Defined in Text.Collate.Collator Methods (==) :: VariableWeighting -> VariableWeighting -> Bool # (/=) :: VariableWeighting -> VariableWeighting -> Bool #
Ord VariableWeighting
Instance details Defined in Text.Collate.Collator Methods compare :: VariableWeighting -> VariableWeighting -> Ordering # (<) :: VariableWeighting -> VariableWeighting -> Bool # (<=) :: VariableWeighting -> VariableWeighting -> Bool # (>) :: VariableWeighting -> VariableWeighting -> Bool # (>=) :: VariableWeighting -> VariableWeighting -> Bool # max :: VariableWeighting -> VariableWeighting -> VariableWeighting # min :: VariableWeighting -> VariableWeighting -> VariableWeighting #
Show VariableWeighting
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> VariableWeighting -> ShowS # show :: VariableWeighting -> String # showList :: [VariableWeighting] -> ShowS #

data CollatorOptions #

Constructors

CollatorOptions

Fields

optLang :: Maybe Lang
Lang used for tailoring. Note that because of fallback rules, this may be somewhat different from the Lang passed to collatorFor. This Lang won't contain unicode extensions used to set options, but it will specify the collation if a non-default collation is being used.
optVariableWeighting :: VariableWeighting
Method for handling variable elements (see http://www.unicode.org/reports/tr10/, Tables 11 and 12).
optFrenchAccents :: Bool
If True, secondary weights are scanned in reverse order, so we get the sorting "cote côte coté côté" instead of "cote coté côte côté"
optUpperBeforeLower :: Bool
Sort uppercase letters before lower
optNormalize :: Bool
If True, strings are normalized to NFD before collation elements are constructed. If the input is already normalized, this option can be set to False for better performance.

Instances

Instances details

Eq CollatorOptions
Instance details Defined in Text.Collate.Collator Methods (==) :: CollatorOptions -> CollatorOptions -> Bool # (/=) :: CollatorOptions -> CollatorOptions -> Bool #
Ord CollatorOptions
Instance details Defined in Text.Collate.Collator Methods compare :: CollatorOptions -> CollatorOptions -> Ordering # (<) :: CollatorOptions -> CollatorOptions -> Bool # (<=) :: CollatorOptions -> CollatorOptions -> Bool # (>) :: CollatorOptions -> CollatorOptions -> Bool # (>=) :: CollatorOptions -> CollatorOptions -> Bool # max :: CollatorOptions -> CollatorOptions -> CollatorOptions # min :: CollatorOptions -> CollatorOptions -> CollatorOptions #
Show CollatorOptions
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> CollatorOptions -> ShowS # show :: CollatorOptions -> String # showList :: [CollatorOptions] -> ShowS #

newtype SortKey #

Constructors

SortKey [Word16]

Instances

Instances details

Eq SortKey
Instance details Defined in Text.Collate.Collator Methods (==) :: SortKey -> SortKey -> Bool # (/=) :: SortKey -> SortKey -> Bool #
Ord SortKey
Instance details Defined in Text.Collate.Collator Methods compare :: SortKey -> SortKey -> Ordering # (<) :: SortKey -> SortKey -> Bool # (<=) :: SortKey -> SortKey -> Bool # (>) :: SortKey -> SortKey -> Bool # (>=) :: SortKey -> SortKey -> Bool # max :: SortKey -> SortKey -> SortKey # min :: SortKey -> SortKey -> SortKey #
Show SortKey
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> SortKey -> ShowS # show :: SortKey -> String # showList :: [SortKey] -> ShowS #

data Collator #

Instances

Instances details

IsString Collator
Instance details Defined in Text.Collate.Collator Methods fromString :: String -> Collator #

tailorings :: [(Lang, Collation)] #

An association list matching Langs with tailored Collations.

parseLang :: Text -> Either String Lang #

Parse a BCP 47 language tag as a Lang.

renderLang :: Lang -> Text #

Render a Lang in BCP 47 form.

lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a) #

Find best match for a Lang in an association list.

data Lang #

Represents a BCP 47 language tag (https://tools.ietf.org/html/bcp47).

Constructors

Lang
Fields langLanguage :: Text langScript :: Maybe Text langRegion :: Maybe Text langVariants :: [Text] langExtensions :: [(Text, [(Text, Text)])] langPrivateUse :: [Text]

Instances

Instances details

Eq Lang
Instance details Defined in Text.Collate.Lang Methods (==) :: Lang -> Lang -> Bool # (/=) :: Lang -> Lang -> Bool #
Ord Lang
Instance details Defined in Text.Collate.Lang Methods compare :: Lang -> Lang -> Ordering # (<) :: Lang -> Lang -> Bool # (<=) :: Lang -> Lang -> Bool # (>) :: Lang -> Lang -> Bool # (>=) :: Lang -> Lang -> Bool # max :: Lang -> Lang -> Lang # min :: Lang -> Lang -> Lang #
Show Lang
Instance details Defined in Text.Collate.Lang Methods showsPrec :: Int -> Lang -> ShowS # show :: Lang -> String # showList :: [Lang] -> ShowS #
IsString Lang
Instance details Defined in Text.Collate.Lang Methods fromString :: String -> Lang #
Binary Lang
Instance details Defined in Text.Collate.Lang Methods put :: Lang -> Put # get :: Get Lang # putList :: [Lang] -> Put #
Lift Lang
Instance details Defined in Text.Collate.Lang Methods lift :: Lang -> Q Exp # liftTyped :: Lang -> Q (TExp Lang) #

Constants

Locale

type Locale = CSize Source #

Locale for case mapping.

pattern LocaleDefault :: Locale Source #

pattern LocaleLithuanian :: Locale Source #

pattern LocaleTurkishAndAzeriLatin :: Locale Source #

Misc

data TextException Source #

Constructors

InvalidUTF8Exception CallStack
InvalidASCIIException CallStack
IndexOutOfTextRange Int CallStack	first payload is invalid char index
EmptyText CallStack

Instances

Instances details

Show TextException Source #
Instance details Defined in Z.Data.Text.Base Methods showsPrec :: Int -> TextException -> ShowS # show :: TextException -> String # showList :: [TextException] -> ShowS #
Exception TextException Source #
Instance details Defined in Z.Data.Text.Base Methods toException :: TextException -> SomeException # fromException :: SomeException -> Maybe TextException # displayException :: TextException -> String #

errorEmptyText :: HasCallStack => a Source #

c_utf8_validate_ba :: ByteArray# -> Int# -> Int# -> Int Source #

c_utf8_validate_addr :: Addr# -> Int -> IO Int Source #

c_ascii_validate_ba :: ByteArray# -> Int# -> Int# -> Int Source #

c_ascii_validate_addr :: Addr# -> Int -> IO Int Source #