{- | Copyright: (c) 2021 John MacFarlane SPDX-License-Identifier: BSD-2-Clause Maintainer: John MacFarlane <jgm@berkeley.edu> This library provides a pure Haskell implementation of the <https://www.unicode.org/reports/tr10 Unicode Collation Algorithm>, allowing proper sorting of Unicode strings. The simplest way to use the library is to use the 'IsString' instance of 'Collator' (together with the @OverloadedStrings@ extension): >>> import Data.List (sortBy) >>> import qualified Data.Text.IO as T >>> mapM_ T.putStrLn $ sortBy (collate "en-US") ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"] abC 𝒶bc 𝕒bc Abc abç äbc Note the difference from the default sort: >>> import Data.List (sort) >>> import qualified Data.Text.IO as T >>> mapM_ T.putStrLn $ sort ["𝒶bc","abC","𝕒bc","Abc","abç","äbc"] Abc abC abç äbc 𝒶bc 𝕒bc A 'Collator' provides a function 'collate' that compares two texts, and a function 'sortKey' that returns the sort key. Most users will just need 'collate'. >>> let de = collatorFor "de" >>> let se = collatorFor "se" >>> collate de "ö" "z" LT >>> collate se "ö" "z" GT >>> sortKey de "ö" SortKey [0x213C,0x0000,0x0020,0x002B,0x0000,0x0002,0x0002] >>> sortKey se "ö" SortKey [0x22FD,0x0000,0x0020,0x0000,0x0002] To sort a string type other than 'Text', the function 'collateWithUnpacker' may be used. It takes as a parameter a function that lazily unpacks the string type into a list of 'Char'. >>> let seCollateString = collateWithUnpacker "se" id >>> seCollateString ("ö" :: String) ("z" :: String) GT Because 'Collator' and 'Lang' have 'IsString' instances, you can just specify them using string literals, as in the above examples. Note, however, that you won't get any feedback if the string doesn't parse correctly as a BCP47 language tag, or if no collation is defined for the specified language; instead, you'll just get the default (root) collator. For this reason, we don't recommend relying on the 'IsString' instance. If you won't know the language until run time, use 'parseLang' to parse it to a 'Lang', handling parse errors, and then pass the 'Lang' to 'collatorFor'. >>> let handleParseError = error -- or something fancier >>> lang <- either handleParseError return $ parseLang "bs-Cyrl" >>> collate (collatorFor lang) "a" "b" LT If you know the language at compile-time, use the 'collator' quasi-quoter and you'll get compile-time errors and warnings: >>> :set -XQuasiQuotes >>> let esTraditional = [collator|es-u-co-trad|] >>> let esStandard = [collator|es|] >>> collate esStandard "Co" "Ch" GT >>> collate esTraditional "Co" "Ch" LT Note that the unicode extension syntax for BCP47 can be used to specify a particular collation for the language (here, Spanish "traditional" instead of the default ordering; the alias `trad` is used because of length limits for BCP47 keywords). The extension syntax can also be used to set collator options. The keyword @kb@ can be used to specify the "backwards" accent sorting that is sometimes used in French: >>> collate "fr" "côte" "coté" GT >>> collate "fr-u-kb" "côte" "coté" LT The keyword @ka@ can be used to specify the variable weighting options which affect how punctuation and whitespace are treated: >>> collate "en-u-ka-shifted" "de-luge" "de Luge" LT >>> collate "en-u-ka-noignore" "de-luge" "de Luge" GT The keyword @kk@ can be used to turn off the normalization step (which is required by the algorithm but can be omitted for better performance if the input is already in NFD form (canonical decomposition). >>> let noNormalizeCollator = [collator|en-u-kk-false|] The keyword @kf@ can be used to say whether uppercase or lowercase letters should be sorted first. >>> collate "en-u-kf-upper" "A" "a" LT >>> collate "en-u-kf-lower" "A" "a" GT These options be combined: >>> collate "de-DE-u-co-phonebk-kb-false-ka-shifted" "Udet" "Über" LT Options can also be set using the functions 'setVariableWeighting', 'setNormalization', 'setUpperBeforeLower', and 'setFrenchAccents': >>> let frC = setFrenchAccents True [collator|fr|] >>> collate frC "côte" "coté" LT -} module Text.Collate ( Collator , collate , collateWithUnpacker , collatorFor , collator , rootCollator , SortKey(..) , sortKey , renderSortKey , VariableWeighting(..) , CollatorOptions(..) , collatorOptions , collatorLang , setVariableWeighting , setNormalization , setFrenchAccents , setUpperBeforeLower , tailorings , module Text.Collate.Lang ) where import Text.Collate.Lang ( lookupLang, parseLang, Lang(..), renderLang ) import Text.Collate.Collator ( collatorFor, collator, setNormalization, setUpperBeforeLower, setFrenchAccents, setVariableWeighting, rootCollator, Collator(collate, sortKey, collateWithUnpacker, collatorOptions), SortKey(..), CollatorOptions(..), collatorLang, VariableWeighting(..), renderSortKey ) import Text.Collate.Tailorings ( tailorings ) -- $setup -- >>> :set -XQuasiQuotes -- >>> :set -XOverloadedStrings