Copyright	(c) 2021 John MacFarlane
License	BSD-2-Clause
Maintainer	John MacFarlane <jgm@berkeley.edu>
Safe Haskell	Safe-Inferred
Language	Haskell2010

Text.Collate

Description

This library provides a pure Haskell implementation of the Unicode Collation Algorithm, allowing proper sorting of Unicode strings.

The simplest way to use the library is to use the IsString instance of Collator (together with the OverloadedStrings extension):

>>> import Data.List (sortBy)
>>> import qualified Data.Text.IO as T
>>> mapM_ T.putStrLn $ sortBy (collate "en-US") ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"]
abC
𝒶bc
𝕒bc
Abc
abç
äbc

Note the difference from the default sort:

>>> import Data.List (sort)
>>> import qualified Data.Text.IO as T
>>> mapM_ T.putStrLn $ sort ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"]
Abc
abC
abç
äbc
𝒶bc
𝕒bc

A Collator provides a function collate that compares two texts, and a function sortKey that returns the sort key. Most users will just need collate.

>>> let de = collatorFor "de"
>>> let se = collatorFor "se"
>>> collate de "ö" "z"
LT
>>> collate se "ö" "z"
GT
>>> sortKey de "ö"
SortKey [0x213C,0x0000,0x0020,0x002B,0x0000,0x0002,0x0002]
>>> sortKey se "ö"
SortKey [0x22FD,0x0000,0x0020,0x0000,0x0002]

To sort a string type other than Text, the function collateWithUnpacker may be used. It takes as a parameter a function that lazily unpacks the string type into a list of Char.

>>> let seCollateString = collateWithUnpacker "se" id
>>> seCollateString ("ö" :: String) ("z" :: String)
GT

Because Collator and Lang have IsString instances, you can just specify them using string literals, as in the above examples. Note, however, that you won't get any feedback if the string doesn't parse correctly as a BCP47 language tag, or if no collation is defined for the specified language; instead, you'll just get the default (root) collator. For this reason, we don't recommend relying on the IsString instance.

If you won't know the language until run time, use parseLang to parse it to a Lang, handling parse errors, and then pass the Lang to collatorFor.

>>> let handleParseError = error  -- or something fancier
>>> lang <- either handleParseError return $ parseLang "bs-Cyrl"
>>> collate (collatorFor lang) "a" "b"
LT

If you know the language at compile-time, use the collator quasi-quoter and you'll get compile-time errors and warnings:

>>> :set -XQuasiQuotes
>>> let esTraditional = [collator|es-u-co-trad|]
>>> let esStandard = [collator|es|]
>>> collate esStandard "Co" "Ch"
GT
>>> collate esTraditional "Co" "Ch"
LT

Note that the unicode extension syntax for BCP47 can be used to specify a particular collation for the language (here, Spanish "traditional" instead of the default ordering; the alias trad is used because of length limits for BCP47 keywords).

The extension syntax can also be used to set collator options. The keyword kb can be used to specify the "backwards" accent sorting that is sometimes used in French:

>>> collate "fr" "côte" "coté"
GT
>>> collate "fr-u-kb" "côte" "coté"
LT

The keyword ka can be used to specify the variable weighting options which affect how punctuation and whitespace are treated:

>>> collate "en-u-ka-shifted" "de-luge" "de Luge"
LT
>>> collate "en-u-ka-noignore" "de-luge" "de Luge"
GT

The keyword kk can be used to turn off the normalization step (which is required by the algorithm but can be omitted for better performance if the input is already in NFD form (canonical decomposition).

>>> let noNormalizeCollator = [collator|en-u-kk-false|]

The keyword kf can be used to say whether uppercase or lowercase letters should be sorted first.

>>> collate "en-u-kf-upper" "A" "a"
LT
>>> collate "en-u-kf-lower" "A" "a"
GT

These options be combined:

>>> collate "de-DE-u-co-phonebk-kb-false-ka-shifted" "Udet" "Über"
LT

Options can also be set using the functions setVariableWeighting, setNormalization, setUpperBeforeLower, and setFrenchAccents:

>>> let frC = setFrenchAccents True [collator|fr|]
>>> collate frC "côte" "coté"
LT

Synopsis

data Collator
collate :: Collator -> Text -> Text -> Ordering
collateWithUnpacker :: Collator -> forall a. Eq a => (a -> [Char]) -> a -> a -> Ordering
collatorFor :: Lang -> Collator
collator :: QuasiQuoter
rootCollator :: Collator
newtype SortKey = SortKey [Word16]
sortKey :: Collator -> Text -> SortKey
renderSortKey :: SortKey -> String
data VariableWeighting
- = NonIgnorable
- | Blanked
- | Shifted
- | ShiftTrimmed
data CollatorOptions = CollatorOptions {
- optLang :: Maybe Lang
- optVariableWeighting :: VariableWeighting
- optFrenchAccents :: Bool
- optUpperBeforeLower :: Bool
- optNormalize :: Bool
}
collatorOptions :: Collator -> CollatorOptions
collatorLang :: Collator -> Maybe Lang
setVariableWeighting :: VariableWeighting -> Collator -> Collator
setNormalization :: Bool -> Collator -> Collator
setFrenchAccents :: Bool -> Collator -> Collator
setUpperBeforeLower :: Bool -> Collator -> Collator
tailorings :: [(Lang, Collation)]
data Lang = Lang {
- langLanguage :: Text
- langScript :: Maybe Text
- langRegion :: Maybe Text
- langVariants :: [Text]
- langExtensions :: [(Text, [(Text, Text)])]
- langPrivateUse :: [Text]
}
lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a)
parseLang :: Text -> Either String Lang
renderLang :: Lang -> Text

Documentation

data Collator Source #

Instances

Instances details

IsString Collator Source #
Instance details Defined in Text.Collate.Collator Methods fromString :: String -> Collator #

collate :: Collator -> Text -> Text -> Ordering Source #

Compare two Texts

collateWithUnpacker :: Collator -> forall a. Eq a => (a -> [Char]) -> a -> a -> Ordering Source #

Compare two strings of any type that can be unpacked lazily into a list of Chars.

collatorFor :: Lang -> Collator Source #

Returns a collator based on a BCP 47 language tag. If no exact match is found, we try to find the best match (falling back to the root collation if nothing else succeeds). If something other than the default collation for a language is desired, the co keyword of the unicode extensions can be used (e.g. es-u-co-trad for traditional Spanish). Other unicode extensions affect the collator options:

The kb keyword has the same effect as setFrenchAccents (e.g. fr-FR-u-kb-true).
The ka keyword has the same effect as setVariableWeight (e.g. fr-FR-u-kb-ka-shifted or en-u-ka-noignore).
The kf keyword has the same effect as setUpperBeforeLower (e.g. fr-u-kf-upper or fr-u-kf-lower).
The kk keyword has the same effect as setNormalization (e.g. fr-u-kk-false).

collator :: QuasiQuoter Source #

Create a collator at compile time based on a BCP 47 language tag: e.g., [collator|es-u-co-trad|]. Requires the QuasiQuotes extension.

rootCollator :: Collator Source #

Default collator based on DUCET table (allkeys.txt).

newtype SortKey Source #

Constructors

SortKey [Word16]

Instances

Instances details

Show SortKey Source #
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> SortKey -> ShowS # show :: SortKey -> String # showList :: [SortKey] -> ShowS #
Eq SortKey Source #
Instance details Defined in Text.Collate.Collator Methods (==) :: SortKey -> SortKey -> Bool # (/=) :: SortKey -> SortKey -> Bool #
Ord SortKey Source #
Instance details Defined in Text.Collate.Collator Methods compare :: SortKey -> SortKey -> Ordering # (<) :: SortKey -> SortKey -> Bool # (<=) :: SortKey -> SortKey -> Bool # (>) :: SortKey -> SortKey -> Bool # (>=) :: SortKey -> SortKey -> Bool # max :: SortKey -> SortKey -> SortKey # min :: SortKey -> SortKey -> SortKey #

sortKey :: Collator -> Text -> SortKey Source #

The sort key used to compare a Text

renderSortKey :: SortKey -> String Source #

Render sort key in the manner used in the CLDR collation test data: the character '|' is used to separate the levels of the key and corresponds to a 0 in the actual sort key.

data VariableWeighting Source #

VariableWeighting affects how punctuation is treated. See http://www.unicode.org/reports/tr10/#Variable_Weighting.

Constructors

NonIgnorable	Don't ignore punctuation (Deluge < deluge-)
Blanked	Completely ignore punctuation (Deluge = deluge-)
Shifted	Consider punctuation at lower priority (de-luge < delu-ge < deluge < deluge- < Deluge)
ShiftTrimmed	Variant of Shifted (deluge < de-luge < delu-ge)

Instances

Instances details

Show VariableWeighting Source #
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> VariableWeighting -> ShowS # show :: VariableWeighting -> String # showList :: [VariableWeighting] -> ShowS #
Eq VariableWeighting Source #
Instance details Defined in Text.Collate.Collator Methods (==) :: VariableWeighting -> VariableWeighting -> Bool # (/=) :: VariableWeighting -> VariableWeighting -> Bool #
Ord VariableWeighting Source #
Instance details Defined in Text.Collate.Collator Methods compare :: VariableWeighting -> VariableWeighting -> Ordering # (<) :: VariableWeighting -> VariableWeighting -> Bool # (<=) :: VariableWeighting -> VariableWeighting -> Bool # (>) :: VariableWeighting -> VariableWeighting -> Bool # (>=) :: VariableWeighting -> VariableWeighting -> Bool # max :: VariableWeighting -> VariableWeighting -> VariableWeighting # min :: VariableWeighting -> VariableWeighting -> VariableWeighting #

data CollatorOptions Source #

Constructors

CollatorOptions

Fields

optLang :: Maybe Lang
Lang used for tailoring. Note that because of fallback rules, this may be somewhat different from the Lang passed to collatorFor. This Lang won't contain unicode extensions used to set options, but it will specify the collation if a non-default collation is being used.
optVariableWeighting :: VariableWeighting
Method for handling variable elements (see http://www.unicode.org/reports/tr10/, Tables 11 and 12).
optFrenchAccents :: Bool
If True, secondary weights are scanned in reverse order, so we get the sorting "cote côte coté côté" instead of "cote coté côte côté"
optUpperBeforeLower :: Bool
Sort uppercase letters before lower
optNormalize :: Bool
If True, strings are normalized to NFD before collation elements are constructed. If the input is already normalized, this option can be set to False for better performance.

Instances

Instances details

Show CollatorOptions Source #
Instance details Defined in Text.Collate.Collator Methods showsPrec :: Int -> CollatorOptions -> ShowS # show :: CollatorOptions -> String # showList :: [CollatorOptions] -> ShowS #
Eq CollatorOptions Source #
Instance details Defined in Text.Collate.Collator Methods (==) :: CollatorOptions -> CollatorOptions -> Bool # (/=) :: CollatorOptions -> CollatorOptions -> Bool #
Ord CollatorOptions Source #
Instance details Defined in Text.Collate.Collator Methods compare :: CollatorOptions -> CollatorOptions -> Ordering # (<) :: CollatorOptions -> CollatorOptions -> Bool # (<=) :: CollatorOptions -> CollatorOptions -> Bool # (>) :: CollatorOptions -> CollatorOptions -> Bool # (>=) :: CollatorOptions -> CollatorOptions -> Bool # max :: CollatorOptions -> CollatorOptions -> CollatorOptions # min :: CollatorOptions -> CollatorOptions -> CollatorOptions #

collatorOptions :: Collator -> CollatorOptions Source #

The options used for this Collator

collatorLang :: Collator -> Maybe Lang Source #

Deprecated: Use (optLang . collatorOptions)

Lang used for tailoring. Because of fallback rules, this may be somewhat different from the Lang passed to collatorFor. This Lang won't contain unicode extensions used to set options, but it will specify the collation if a non-default collation is being used.

setVariableWeighting :: VariableWeighting -> Collator -> Collator Source #

Set method for handling variable elements (punctuation and spaces): see http://www.unicode.org/reports/tr10/, Tables 11 and 12.

setNormalization :: Bool -> Collator -> Collator Source #

The Unicode Collation Algorithm expects input to be normalized into its canonical decomposition (NFD). By default, collators perform this normalization. If your input is already normalized, you can increase performance by disabling this step: setNormalization False.

setFrenchAccents :: Bool -> Collator -> Collator Source #

setFrenchAccents True causes secondary weights to be scanned in reverse order, so we get the sorting cote côte coté côté instead of cote coté côte côté. The default is usually False, except for fr-CA where it is True.

setUpperBeforeLower :: Bool -> Collator -> Collator Source #

Most collations default to sorting lowercase letters before uppercase (exceptions: mt, da, cu). To select the opposite behavior, use setUpperBeforeLower True.

tailorings :: [(Lang, Collation)] Source #

An association list matching Langs with tailored Collations.

data Lang Source #

Represents a BCP 47 language tag (https://tools.ietf.org/html/bcp47).

Constructors

Lang
Fields langLanguage :: Text langScript :: Maybe Text langRegion :: Maybe Text langVariants :: [Text] langExtensions :: [(Text, [(Text, Text)])] langPrivateUse :: [Text]

Instances

Instances details

IsString Lang Source #
Instance details Defined in Text.Collate.Lang Methods fromString :: String -> Lang #
Show Lang Source #
Instance details Defined in Text.Collate.Lang Methods showsPrec :: Int -> Lang -> ShowS # show :: Lang -> String # showList :: [Lang] -> ShowS #
Binary Lang Source #
Instance details Defined in Text.Collate.Lang Methods put :: Lang -> Put # get :: Get Lang # putList :: [Lang] -> Put #
Eq Lang Source #
Instance details Defined in Text.Collate.Lang Methods (==) :: Lang -> Lang -> Bool # (/=) :: Lang -> Lang -> Bool #
Ord Lang Source #
Instance details Defined in Text.Collate.Lang Methods compare :: Lang -> Lang -> Ordering # (<) :: Lang -> Lang -> Bool # (<=) :: Lang -> Lang -> Bool # (>) :: Lang -> Lang -> Bool # (>=) :: Lang -> Lang -> Bool # max :: Lang -> Lang -> Lang # min :: Lang -> Lang -> Lang #
Lift Lang Source #
Instance details Defined in Text.Collate.Lang Methods lift :: Quote m => Lang -> m Exp # liftTyped :: forall (m :: Type -> Type). Quote m => Lang -> Code m Lang #

lookupLang :: Lang -> [(Lang, a)] -> Maybe (Lang, a) Source #

Find best match for a Lang in an association list.

parseLang :: Text -> Either String Lang Source #

Parse a BCP 47 language tag as a Lang.

renderLang :: Lang -> Text Source #

Render a Lang in BCP 47 form.