{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DeriveLift #-}
module Text.Collate.Collation
 ( Collation(..)
 , CollationElement(..)
 , unfoldCollation
 , insertElements
 , alterElements
 , suppressContractions
 , findLast
 , findFirst
 , matchLongestPrefix
 , getCollationElements
 , parseCollation
 , parseCJKOverrides
 )
where

import qualified Data.IntSet as IntSet
import qualified Data.Text as T
import qualified Data.Text.Read as TR
import Data.Text (Text)
import Data.Bits ( Bits((.|.), shiftR, (.&.)) )
import Data.List (foldl')
import Text.Collate.UnicodeData (readCodePoints)
import Data.Maybe
import Data.Foldable (minimumBy, maximumBy)
import Data.Word (Word16)
import Data.Binary (Binary(get, put))
import Language.Haskell.TH.Syntax (Lift(..))
import Instances.TH.Lift ()
import qualified Text.Collate.Trie as Trie
import Text.Collate.CanonicalCombiningClass (canonicalCombiningClass)
import Text.Printf
#if MIN_VERSION_base(4,11,0)
#else
import Data.Semigroup (Semigroup(..))
#endif
-- import Debug.Trace

data CollationElement =
  CollationElement
    { CollationElement -> Bool
collationVariable :: !Bool
    , CollationElement -> Word16
collationL1       :: {-# UNPACK #-} !Word16
    , CollationElement -> Word16
collationL2       :: {-# UNPACK #-} !Word16
    , CollationElement -> Word16
collationL3       :: {-# UNPACK #-} !Word16
    , CollationElement -> Word16
collationL4       :: {-# UNPACK #-} !Word16
    } deriving (CollationElement -> CollationElement -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CollationElement -> CollationElement -> Bool
$c/= :: CollationElement -> CollationElement -> Bool
== :: CollationElement -> CollationElement -> Bool
$c== :: CollationElement -> CollationElement -> Bool
Eq, forall t.
(forall (m :: * -> *). Quote m => t -> m Exp)
-> (forall (m :: * -> *). Quote m => t -> Code m t) -> Lift t
forall (m :: * -> *). Quote m => CollationElement -> m Exp
forall (m :: * -> *).
Quote m =>
CollationElement -> Code m CollationElement
liftTyped :: forall (m :: * -> *).
Quote m =>
CollationElement -> Code m CollationElement
$cliftTyped :: forall (m :: * -> *).
Quote m =>
CollationElement -> Code m CollationElement
lift :: forall (m :: * -> *). Quote m => CollationElement -> m Exp
$clift :: forall (m :: * -> *). Quote m => CollationElement -> m Exp
Lift)

instance Ord CollationElement where
 compare :: CollationElement -> CollationElement -> Ordering
compare (CollationElement Bool
_ Word16
p1 Word16
s1 Word16
t1 Word16
q1) (CollationElement Bool
_ Word16
p2 Word16
s2 Word16
t2 Word16
q2) =
   forall a. Ord a => a -> a -> Ordering
compare Word16
p1 Word16
p2 forall a. Semigroup a => a -> a -> a
<> forall a. Ord a => a -> a -> Ordering
compare Word16
s1 Word16
s2 forall a. Semigroup a => a -> a -> a
<> forall a. Ord a => a -> a -> Ordering
compare Word16
t1 Word16
t2 forall a. Semigroup a => a -> a -> a
<> forall a. Ord a => a -> a -> Ordering
compare Word16
q1 Word16
q2

instance Show CollationElement where
  show :: CollationElement -> String
show (CollationElement Bool
v Word16
l1 Word16
l2 Word16
l3 Word16
l4) =
    forall r. PrintfType r => String -> r
printf String
"CollationElement %s 0x%04X 0x%04X 0x%04X 0x%04X" (forall a. Show a => a -> String
show Bool
v) Word16
l1 Word16
l2 Word16
l3 Word16
l4

instance Binary CollationElement where
   put :: CollationElement -> Put
put (CollationElement Bool
v Word16
w Word16
x Word16
y Word16
z) = forall t. Binary t => t -> Put
put (Bool
v,Word16
w,Word16
x,Word16
y,Word16
z)
   get :: Get CollationElement
get = do
     (Bool
v,Word16
w,Word16
x,Word16
y,Word16
z) <- forall t. Binary t => Get t
get
     forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Bool -> Word16 -> Word16 -> Word16 -> Word16 -> CollationElement
CollationElement Bool
v Word16
w Word16
x Word16
y Word16
z

newtype Collation = Collation { Collation -> Trie [CollationElement]
unCollation :: Trie.Trie [CollationElement] }
  deriving (Int -> Collation -> ShowS
[Collation] -> ShowS
Collation -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Collation] -> ShowS
$cshowList :: [Collation] -> ShowS
show :: Collation -> String
$cshow :: Collation -> String
showsPrec :: Int -> Collation -> ShowS
$cshowsPrec :: Int -> Collation -> ShowS
Show, Collation -> Collation -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Collation -> Collation -> Bool
$c/= :: Collation -> Collation -> Bool
== :: Collation -> Collation -> Bool
$c== :: Collation -> Collation -> Bool
Eq, Eq Collation
Collation -> Collation -> Bool
Collation -> Collation -> Ordering
Collation -> Collation -> Collation
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Collation -> Collation -> Collation
$cmin :: Collation -> Collation -> Collation
max :: Collation -> Collation -> Collation
$cmax :: Collation -> Collation -> Collation
>= :: Collation -> Collation -> Bool
$c>= :: Collation -> Collation -> Bool
> :: Collation -> Collation -> Bool
$c> :: Collation -> Collation -> Bool
<= :: Collation -> Collation -> Bool
$c<= :: Collation -> Collation -> Bool
< :: Collation -> Collation -> Bool
$c< :: Collation -> Collation -> Bool
compare :: Collation -> Collation -> Ordering
$ccompare :: Collation -> Collation -> Ordering
Ord, forall t.
(forall (m :: * -> *). Quote m => t -> m Exp)
-> (forall (m :: * -> *). Quote m => t -> Code m t) -> Lift t
forall (m :: * -> *). Quote m => Collation -> m Exp
forall (m :: * -> *). Quote m => Collation -> Code m Collation
liftTyped :: forall (m :: * -> *). Quote m => Collation -> Code m Collation
$cliftTyped :: forall (m :: * -> *). Quote m => Collation -> Code m Collation
lift :: forall (m :: * -> *). Quote m => Collation -> m Exp
$clift :: forall (m :: * -> *). Quote m => Collation -> m Exp
Lift, NonEmpty Collation -> Collation
Collation -> Collation -> Collation
forall b. Integral b => b -> Collation -> Collation
forall a.
(a -> a -> a)
-> (NonEmpty a -> a)
-> (forall b. Integral b => b -> a -> a)
-> Semigroup a
stimes :: forall b. Integral b => b -> Collation -> Collation
$cstimes :: forall b. Integral b => b -> Collation -> Collation
sconcat :: NonEmpty Collation -> Collation
$csconcat :: NonEmpty Collation -> Collation
<> :: Collation -> Collation -> Collation
$c<> :: Collation -> Collation -> Collation
Semigroup, Semigroup Collation
Collation
[Collation] -> Collation
Collation -> Collation -> Collation
forall a.
Semigroup a -> a -> (a -> a -> a) -> ([a] -> a) -> Monoid a
mconcat :: [Collation] -> Collation
$cmconcat :: [Collation] -> Collation
mappend :: Collation -> Collation -> Collation
$cmappend :: Collation -> Collation -> Collation
mempty :: Collation
$cmempty :: Collation
Monoid)

instance Binary Collation where
   put :: Collation -> Put
put (Collation Trie [CollationElement]
m) = forall t. Binary t => t -> Put
put Trie [CollationElement]
m
   get :: Get Collation
get = Trie [CollationElement] -> Collation
Collation forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall t. Binary t => Get t
get



-- | Unfold a 'Collation' into an association list.
unfoldCollation :: Collation -> [([Int], [CollationElement])]
unfoldCollation :: Collation -> [([Int], [CollationElement])]
unfoldCollation (Collation Trie [CollationElement]
trie) = forall a. Trie a -> [([Int], a)]
Trie.unfoldTrie Trie [CollationElement]
trie

-- | Insert collation elements for the given code points (if there is
-- more than one code point, it is a contraction).
insertElements :: [Int] -> [CollationElement] -> Collation -> Collation
insertElements :: [Int] -> [CollationElement] -> Collation -> Collation
insertElements [Int]
codepoints [CollationElement]
els (Collation Trie [CollationElement]
trie) =
  Trie [CollationElement] -> Collation
Collation forall a b. (a -> b) -> a -> b
$ forall a. [Int] -> a -> Trie a -> Trie a
Trie.insert [Int]
codepoints [CollationElement]
els Trie [CollationElement]
trie

-- | Suppress contracts starting with any of the code points in the list.
suppressContractions :: [Int] -> Collation -> Collation
suppressContractions :: [Int] -> Collation -> Collation
suppressContractions [Int]
cps Collation
coll =
  forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr ((Maybe [CollationElement] -> Maybe [CollationElement])
-> [Int] -> Collation -> Collation
alterElements (forall a b. a -> b -> a
const forall a. Maybe a
Nothing)) Collation
coll
    [[Int]
is | is :: [Int]
is@(Int
i:Int
_:[Int]
_) <- [[Int]]
collationKeys, Int
i forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Int]
cps]
 where
  collationKeys :: [[Int]]
collationKeys = forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> a
fst forall a b. (a -> b) -> a -> b
$ Collation -> [([Int], [CollationElement])]
unfoldCollation Collation
coll

-- | Change the collation elements defined for the specified code point(s).
alterElements :: (Maybe [CollationElement] -> Maybe [CollationElement])
              -> [Int] -> Collation -> Collation
alterElements :: (Maybe [CollationElement] -> Maybe [CollationElement])
-> [Int] -> Collation -> Collation
alterElements Maybe [CollationElement] -> Maybe [CollationElement]
f [Int]
codepoints (Collation Trie [CollationElement]
trie) =
  Trie [CollationElement] -> Collation
Collation forall a b. (a -> b) -> a -> b
$ forall a. (Maybe a -> Maybe a) -> [Int] -> Trie a -> Trie a
Trie.alter Maybe [CollationElement] -> Maybe [CollationElement]
f [Int]
codepoints Trie [CollationElement]
trie

{-# SPECIALIZE matchLongestPrefix
  :: Collation -> [Int] -> Maybe ([CollationElement], Int, Collation) #-}

-- | Find the longest matching prefix of a list of code points
-- in the collation table. This may be a single code point or
-- several (if contractions are defined).  Return the
-- collation elements for the matched code points, the code points
-- matched, and a "subcollation" which can be searched for further
-- matches. (This is needed because of "discontiguous matches";
-- see <http://www.unicode.org/reports/tr10/#Input_Matching>.)
matchLongestPrefix :: Foldable t
                   => Collation
                   -> t Int
                   -> Maybe ([CollationElement], Int, Collation)
matchLongestPrefix :: forall (t :: * -> *).
Foldable t =>
Collation -> t Int -> Maybe ([CollationElement], Int, Collation)
matchLongestPrefix (Collation Trie [CollationElement]
trie) t Int
codepoints =
  case forall (t :: * -> *) a.
Foldable t =>
Trie a -> t Int -> Maybe (a, Int, Trie a)
Trie.matchLongestPrefix Trie [CollationElement]
trie t Int
codepoints of
    Maybe ([CollationElement], Int, Trie [CollationElement])
Nothing -> forall a. Maybe a
Nothing
    Just ([CollationElement]
els, Int
consumed, Trie [CollationElement]
trie') -> forall a. a -> Maybe a
Just ([CollationElement]
els, Int
consumed, Trie [CollationElement] -> Collation
Collation Trie [CollationElement]
trie')

lookupNonEmptyChild :: Collation
                    -> Int
                    -> Maybe ([CollationElement], Collation)
lookupNonEmptyChild :: Collation -> Int -> Maybe ([CollationElement], Collation)
lookupNonEmptyChild (Collation Trie [CollationElement]
trie) Int
point =
  case forall a. Trie a -> Int -> Maybe (a, Trie a)
Trie.lookupNonEmptyChild Trie [CollationElement]
trie Int
point of
    Maybe ([CollationElement], Trie [CollationElement])
Nothing -> forall a. Maybe a
Nothing
    Just ([CollationElement]
els, Trie [CollationElement]
trie') -> forall a. a -> Maybe a
Just ([CollationElement]
els, Trie [CollationElement] -> Collation
Collation Trie [CollationElement]
trie')

-- | Find the first element in a 'Collation' that meets a condition.
-- Return the code points and the elements.
findFirst :: ([CollationElement] -> Bool)
          -> Collation
          -> Maybe ([Int], [CollationElement])
findFirst :: ([CollationElement] -> Bool)
-> Collation -> Maybe ([Int], [CollationElement])
findFirst [CollationElement] -> Bool
f (Collation Trie [CollationElement]
trie) =
  case forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
minimumBy forall {a} {a}.
(a, [CollationElement]) -> (a, [CollationElement]) -> Ordering
comp forall a b. (a -> b) -> a -> b
$ forall a. Trie a -> [([Int], a)]
Trie.unfoldTrie Trie [CollationElement]
trie of
    ([Int]
is,[CollationElement]
elts) | [CollationElement] -> Bool
f [CollationElement]
elts -> forall a. a -> Maybe a
Just ([Int]
is,[CollationElement]
elts)
    ([Int], [CollationElement])
_ -> forall a. Maybe a
Nothing
 where
  comp :: (a, [CollationElement]) -> (a, [CollationElement]) -> Ordering
comp (a
_,[CollationElement]
x) (a
_,[CollationElement]
y) =  -- note Left a < Right a
    forall a. Ord a => a -> a -> Ordering
compare (if [CollationElement] -> Bool
f [CollationElement]
x then forall a b. a -> Either a b
Left [CollationElement]
x else forall a b. b -> Either a b
Right [CollationElement]
x)
            (if [CollationElement] -> Bool
f [CollationElement]
y then forall a b. a -> Either a b
Left [CollationElement]
y else forall a b. b -> Either a b
Right [CollationElement]
y)

-- | Find the last element in a 'Collation' that meets a condition.
-- Return the code points and the elements.
findLast :: ([CollationElement] -> Bool)
         -> Collation
         -> Maybe ([Int], [CollationElement])
findLast :: ([CollationElement] -> Bool)
-> Collation -> Maybe ([Int], [CollationElement])
findLast [CollationElement] -> Bool
f (Collation Trie [CollationElement]
trie) =
  case forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy forall {a} {a}.
(a, [CollationElement]) -> (a, [CollationElement]) -> Ordering
comp forall a b. (a -> b) -> a -> b
$ forall a. Trie a -> [([Int], a)]
Trie.unfoldTrie Trie [CollationElement]
trie of
    ([Int]
is,[CollationElement]
elts) | [CollationElement] -> Bool
f [CollationElement]
elts -> forall a. a -> Maybe a
Just ([Int]
is,[CollationElement]
elts)
    ([Int], [CollationElement])
_ -> forall a. Maybe a
Nothing
 where
  comp :: (a, [CollationElement]) -> (a, [CollationElement]) -> Ordering
comp (a
_,[CollationElement]
x) (a
_,[CollationElement]
y) =  -- note Left a < Right a
    forall a. Ord a => a -> a -> Ordering
compare (if [CollationElement] -> Bool
f [CollationElement]
x then forall a b. b -> Either a b
Right [CollationElement]
x else forall a b. a -> Either a b
Left [CollationElement]
x)
            (if [CollationElement] -> Bool
f [CollationElement]
y then forall a b. b -> Either a b
Right [CollationElement]
y else forall a b. a -> Either a b
Left [CollationElement]
y)


-- S2.1 Find the longest initial substring S at each point that
-- has a match in the collation element table.
--
--     S2.1.1 If there are any non-starters following S, process each
--     non-starter C.
--
--     S2.1.2 If C is an unblocked non-starter with respect to S,
--     find if S + C has a match in the collation element table.
--
--     S2.1.3 If there is a match, replace S by S + C, and remove C.
--
-- Blocking Context: The presence of a character B between two characters
-- C1 and C2, where ccc(B) = 0 or ccc(B) ≥ ccc(C2).
--
-- Non-Starter: An assigned character with Canonical_Combining_Class ≠ 0.
--
-- Unblocked Non-Starter: A non-starter C2 which is not in a blocking
-- context with respect to a preceding character C1 in a string.
--
-- In the context <C1 ... B ... C2>, if there is no intervening
-- character B which meets the criterion for being a blocking context,
-- and if C2 is a non-starter, then it is also an unblocked non-starter.

-- | Retrieve the collation elements defined by a collation for
-- a sequence of code points.  These are used to construct a 'SortKey'.
getCollationElements :: Collation -> [Int] -> [CollationElement]
getCollationElements :: Collation -> [Int] -> [CollationElement]
getCollationElements Collation
collation = [Int] -> [CollationElement]
go
 where
  go :: [Int] -> [CollationElement]
go [] = []
  go (Int
c:[Int]
cs) =
    case forall (t :: * -> *).
Foldable t =>
Collation -> t Int -> Maybe ([CollationElement], Int, Collation)
matchLongestPrefix Collation
collation (Int
cforall a. a -> [a] -> [a]
:[Int]
cs) of
       Maybe ([CollationElement], Int, Collation)
Nothing -> Int -> [CollationElement]
calculateImplicitWeight Int
c forall a. [a] -> [a] -> [a]
++ [Int] -> [CollationElement]
go [Int]
cs
       Just ([CollationElement]
elts, Int
consumed, Collation
subcollation)
               -> [CollationElement]
elts' forall a. [a] -> [a] -> [a]
++ [Int] -> [CollationElement]
go ([Int]
unblockedNonStarters' forall a. [a] -> [a] -> [a]
++ [Int]
is')
          where
             getUnblockedNonStarters :: Int -> [Int] -> ([Int], [Int])
getUnblockedNonStarters Int
_ [] = ([], [])
             getUnblockedNonStarters Int
n (Int
x:[Int]
xs)
               = case Int -> Int
canonicalCombiningClass Int
x of
                   Int
ccc
                     | Int
ccc forall a. Ord a => a -> a -> Bool
> Int
n,
                       ([Int]
xs', [Int]
rest) <- Int -> [Int] -> ([Int], [Int])
getUnblockedNonStarters Int
ccc [Int]
xs
                       -> (Int
x forall a. a -> [a] -> [a]
: [Int]
xs', [Int]
rest)
                     | Bool
otherwise -> ([], Int
x forall a. a -> [a] -> [a]
: [Int]
xs)
             ([Int]
unblockedNonStarters, [Int]
is') = Int -> [Int] -> ([Int], [Int])
getUnblockedNonStarters Int
0
                                             (forall a. Int -> [a] -> [a]
drop Int
consumed (Int
cforall a. a -> [a] -> [a]
:[Int]
cs))
             ([CollationElement]
elts', [Int]
unblockedNonStarters') =
               [CollationElement]
-> [Int] -> Collation -> ([CollationElement], [Int])
extendMatch [CollationElement]
elts [Int]
unblockedNonStarters Collation
subcollation
             -- find the first unblocked non-starter that can extend
             -- the current match, also removing it from the code
             -- point list
             popExtender :: [Int] -> Collation -> Maybe ([CollationElement], [Int], Collation)
popExtender = forall {c}.
([Int] -> c)
-> [Int] -> Collation -> Maybe ([CollationElement], c, Collation)
popExtender' forall a. a -> a
id
             popExtender' :: ([Int] -> c)
-> [Int] -> Collation -> Maybe ([CollationElement], c, Collation)
popExtender' [Int] -> c
_ [] Collation
_ = forall a. Maybe a
Nothing
             popExtender' [Int] -> c
acc (Int
x:[Int]
xs) Collation
subc
               = case Collation -> Int -> Maybe ([CollationElement], Collation)
lookupNonEmptyChild Collation
subc Int
x of
                   Just ([CollationElement]
es', Collation
subc') -> forall a. a -> Maybe a
Just ([CollationElement]
es', [Int] -> c
acc [Int]
xs, Collation
subc')
                   Maybe ([CollationElement], Collation)
Nothing -> ([Int] -> c)
-> [Int] -> Collation -> Maybe ([CollationElement], c, Collation)
popExtender' ([Int] -> c
acc forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
x forall a. a -> [a] -> [a]
:)) [Int]
xs Collation
subc
             extendMatch :: [CollationElement]
-> [Int] -> Collation -> ([CollationElement], [Int])
extendMatch [CollationElement]
es [Int]
ubs Collation
subc = case [Int] -> Collation -> Maybe ([CollationElement], [Int], Collation)
popExtender [Int]
ubs Collation
subc of
               Just ([CollationElement]
es', [Int]
ubs', Collation
subc') -> [CollationElement]
-> [Int] -> Collation -> ([CollationElement], [Int])
extendMatch [CollationElement]
es' [Int]
ubs' Collation
subc'
               Maybe ([CollationElement], [Int], Collation)
Nothing -> ([CollationElement]
es, [Int]
ubs)

-- see 10.1.3, Implicit Weights
-- from allkeys.txt:
-- @implicitweights 17000..18AFF; FB00 # Tangut and Tangut Components
-- @implicitweights 18D00..18D8F; FB00 # Tangut Supplement
-- @implicitweights 1B170..1B2FF; FB01 # Nushu
-- @implicitweights 18B00..18CFF; FB02 # Khitan Small Script
calculateImplicitWeight :: Int -> [CollationElement]
calculateImplicitWeight :: Int -> [CollationElement]
calculateImplicitWeight Int
cp =
  [Bool -> Word16 -> Word16 -> Word16 -> Word16 -> CollationElement
CollationElement Bool
False (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
aaaa) Word16
0x0020 Word16
0x0002 Word16
0xFFFF,
   Bool -> Word16 -> Word16 -> Word16 -> Word16 -> CollationElement
CollationElement Bool
False (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
bbbb) Word16
0 Word16
0 Word16
0xFFFF]
 where
  range :: Int -> Int -> IntSet
range Int
x Int
y = [Int] -> IntSet
IntSet.fromList [Int
x..Int
y]
  singleton :: Int -> IntSet
singleton = Int -> IntSet
IntSet.singleton
  union :: IntSet -> IntSet -> IntSet
union = IntSet -> IntSet -> IntSet
IntSet.union
  -- from PropList.txt in unicode data:
  unifiedIdeographs :: IntSet
unifiedIdeographs =    Int -> Int -> IntSet
range Int
0x3400 Int
0x4DBF IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x4E00 Int
0x9FFC IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0xFA0E Int
0xFA0F IntSet -> IntSet -> IntSet
`union`
                         Int -> IntSet
singleton Int
0xFA11 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0xFA13 Int
0xFA14 IntSet -> IntSet -> IntSet
`union`
                         Int -> IntSet
singleton Int
0xFA1F IntSet -> IntSet -> IntSet
`union`
                         Int -> IntSet
singleton Int
0xFA21 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0xFA23 Int
0xFA24 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0xFA27 Int
0xFA29 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x20000 Int
0x2A6DD IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x2A700 Int
0x2B734 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x2B740 Int
0x2B81D IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x2B820 Int
0x2CEA1 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x2CEB0 Int
0x2EBE0 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x2CEB0 Int
0x2EBE0 IntSet -> IntSet -> IntSet
`union`
                         Int -> Int -> IntSet
range Int
0x30000 Int
0x3134A
  -- from Blocks.txt in unicode data:
  cjkCompatibilityIdeographs :: IntSet
cjkCompatibilityIdeographs = Int -> Int -> IntSet
range Int
0xF900 Int
0xFAFF
  cjkUnifiedIdeographs :: IntSet
cjkUnifiedIdeographs = Int -> Int -> IntSet
range Int
0x4E00 Int
0x9FFF
  (Int
aaaa, Int
bbbb) =
    case Int
cp of
    Int
_ | Int
cp forall a. Ord a => a -> a -> Bool
>= Int
0x17000 , Int
cp forall a. Ord a => a -> a -> Bool
<= Int
0x18AFF -- Tangut and Tangut Components
        -> (Int
0xFB00, (Int
cp forall a. Num a => a -> a -> a
- Int
0x17000) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Int
cp forall a. Ord a => a -> a -> Bool
>= Int
0x18D00 , Int
cp forall a. Ord a => a -> a -> Bool
<= Int
0x18D8F -- Tangut Supplement
        -> (Int
0xFB00, (Int
cp forall a. Num a => a -> a -> a
- Int
0x17000) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Int
cp forall a. Ord a => a -> a -> Bool
>= Int
0x1B170 , Int
cp forall a. Ord a => a -> a -> Bool
<= Int
0x1B2FF -- Nushu
        -> (Int
0xFB01, (Int
cp forall a. Num a => a -> a -> a
- Int
0x1B170) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Int
cp forall a. Ord a => a -> a -> Bool
>= Int
0x18B00 , Int
cp forall a. Ord a => a -> a -> Bool
<= Int
0x18CFF -- Khitan Small Script
        -> (Int
0xFB02, (Int
cp forall a. Num a => a -> a -> a
- Int
0x18B00) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Int
cp Int -> IntSet -> Bool
`IntSet.member` IntSet
unifiedIdeographs Bool -> Bool -> Bool
&&
        (Int
cp Int -> IntSet -> Bool
`IntSet.member` IntSet
cjkUnifiedIdeographs Bool -> Bool -> Bool
||
         Int
cp Int -> IntSet -> Bool
`IntSet.member` IntSet
cjkCompatibilityIdeographs)  -- Core Han
        -> (Int
0xFB40 forall a. Num a => a -> a -> a
+ (Int
cp forall a. Bits a => a -> Int -> a
`shiftR` Int
15), (Int
cp forall a. Bits a => a -> a -> a
.&. Int
0x7FFF) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Int
cp Int -> IntSet -> Bool
`IntSet.member` IntSet
unifiedIdeographs -- All Other Han Unified ?
        -> (Int
0xFB80 forall a. Num a => a -> a -> a
+ (Int
cp forall a. Bits a => a -> Int -> a
`shiftR` Int
15), (Int
cp forall a. Bits a => a -> a -> a
.&. Int
0x7FFF) forall a. Bits a => a -> a -> a
.|. Int
0x8000)
      | Bool
otherwise
        -> (Int
0xFBC0 forall a. Num a => a -> a -> a
+ (Int
cp forall a. Bits a => a -> Int -> a
`shiftR` Int
15), (Int
cp forall a. Bits a => a -> a -> a
.&. Int
0x7FFFF) forall a. Bits a => a -> a -> a
.|. Int
0x8000)

-- | Parse a 'Collation' from a Text in the format of @allkeys.txt@.
parseCollation :: Text -> Collation
parseCollation :: Text -> Collation
parseCollation = forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' Collation -> Text -> Collation
processLine forall a. Monoid a => a
mempty forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
T.lines
 where
  processLine :: Collation -> Text -> Collation
processLine Collation
trie Text
t =
    case Text -> ([Int], Text)
readCodePoints Text
t of
      ([],Text
_) -> Collation
trie
      (Int
c:[Int]
cs, Text
rest) -> [Int] -> [CollationElement] -> Collation -> Collation
insertElements (Int
cforall a. a -> [a] -> [a]
:[Int]
cs) (Text -> [CollationElement]
go Text
rest) Collation
trie
  go :: Text -> [CollationElement]
go Text
t =
    case (Char -> Bool) -> Text -> (Text, Text)
T.break (forall a. Eq a => a -> a -> Bool
== Char
']') (Int -> Text -> Text
T.drop Int
1 forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Text -> Text
T.dropWhile (forall a. Eq a => a -> a -> Bool
/= Char
'[') Text
t) of
      (Text
contents, Text
rest)
         | Text -> Bool
T.null Text
rest -> []
         | Bool
otherwise   -> Text -> CollationElement
parseContents Text
contents forall a. a -> [a] -> [a]
: Text -> [CollationElement]
go Text
rest
  parseContents :: Text -> CollationElement
parseContents Text
t =
    let isVariable :: Bool
isVariable = Bool -> Bool
not (Text -> Bool
T.null Text
t) Bool -> Bool -> Bool
&& Text -> Char
T.head Text
t forall a. Eq a => a -> a -> Bool
== Char
'*'
        isIgnorable :: (a, a, a) -> Bool
isIgnorable (a
0,a
0,a
0) = Bool
True
        isIgnorable (a, a, a)
_       = Bool
False
    in case forall a b. (a -> b) -> [a] -> [b]
map forall a. Integral a => Reader a
TR.hexadecimal forall a b. (a -> b) -> a -> b
$ forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null)
                                  ((Char -> Bool) -> Text -> [Text]
T.split Char -> Bool
isSep Text
t) of
              [Right (Word16
x,Text
_), Right (Word16
y,Text
_), Right (Word16
z,Text
_)]
                -> Bool -> Word16 -> Word16 -> Word16 -> Word16 -> CollationElement
CollationElement Bool
isVariable Word16
x Word16
y Word16
z
                                    (if Bool
isVariable Bool -> Bool -> Bool
|| forall {a} {a} {a}.
(Eq a, Eq a, Eq a, Num a, Num a, Num a) =>
(a, a, a) -> Bool
isIgnorable (Word16
x,Word16
y,Word16
z)
                                        then Word16
0
                                        else Word16
0xFFFF)
              [Either String (Word16, Text)]
_ -> Bool -> Word16 -> Word16 -> Word16 -> Word16 -> CollationElement
CollationElement Bool
isVariable Word16
0 Word16
0 Word16
0 Word16
0
  isSep :: Char -> Bool
isSep Char
'*' = Bool
True
  isSep Char
'.' = Bool
True
  isSep Char
_   = Bool
False

-- the result is a list of code points; the first will be assigned
-- the colllation element [0x8000, 0x0020, 0x0002], the next
-- [0x8001, 0x0020, 0x0002], and so on.
parseCJKOverrides :: Text -> [Int]
parseCJKOverrides :: Text -> [Int]
parseCJKOverrides = forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe forall {a}. Integral a => Text -> Maybe a
chunkToCp forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
T.words
 where
  chunkToCp :: Text -> Maybe a
chunkToCp Text
t =
    case forall a. Integral a => Reader a
TR.hexadecimal Text
t of
      Right (a
x,Text
rest)
        | Text -> Bool
T.null Text
rest -> forall a. a -> Maybe a
Just a
x
      Either String (a, Text)
_ -> forall a. Maybe a
Nothing -- like the perl module we ignore e.g. FDD0-0041