{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}

-- | We define our own normalization function instead of depending on
-- unicode-transforms, because we need a lazy (streaming) normalization
-- function for maximum efficiency.  No point normalizing two whole 'Text's
-- if we can see from the first few characters how they should be ordered.
-- See <https://unicode.org/reports/tr15/> for a description of the algorithm
-- implemented here.
module Text.Collate.Normalize
  ( toNFD
  )
where
import qualified Data.IntMap as M
import Text.Collate.UnicodeData (genCanonicalDecompositionMap)
import Text.Collate.CanonicalCombiningClass (canonicalCombiningClass)
import Data.List (sortOn)

canonicalDecompositionMap :: M.IntMap [Int]
canonicalDecompositionMap :: IntMap [Int]
canonicalDecompositionMap = $(Int
[Int]
[(Int, [Int])]
[(Int, [Int])] -> IntMap [Int]
forall a. [(Int, a)] -> IntMap a
genCanonicalDecompositionMap)

-- | Lazily normalize a list of code points to its canonical decomposition (NFD).
toNFD :: [Int] -> [Int]
toNFD :: [Int] -> [Int]
toNFD = [Int] -> [Int]
rearrangeCombiningMarks ([Int] -> [Int]) -> ([Int] -> [Int]) -> [Int] -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Int]
recursivelyDecompose

rearrangeCombiningMarks :: [Int] -> [Int]
rearrangeCombiningMarks :: [Int] -> [Int]
rearrangeCombiningMarks = [Int] -> [Int]
go
 where
  go :: [Int] -> [Int]
go [] = []
  go (Int
c:[Int]
cs) =
    if Int -> Int
canonicalCombiningClass Int
c Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
       then
         Int
c Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: case [Int] -> ([Int], [Int])
reorderMarks [Int]
cs of
               ([], [Int]
rest)    -> [Int] -> [Int]
go [Int]
rest
               ([Int]
marks, [Int]
rest) -> (Int -> [Int] -> [Int]) -> [Int] -> [Int] -> [Int]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (:) ([Int] -> [Int]
go [Int]
rest) [Int]
marks
       else
         case [Int] -> ([Int], [Int])
reorderMarks (Int
cInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:[Int]
cs) of
               ([], [Int]
rest)    -> [Int] -> [Int]
go [Int]
rest
               ([Int]
marks, [Int]
rest) -> (Int -> [Int] -> [Int]) -> [Int] -> [Int] -> [Int]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (:) ([Int] -> [Int]
go [Int]
rest) [Int]
marks
  reorderMarks :: [Int] -> ([Int], [Int])
reorderMarks [Int]
zs =
    case (Int -> Bool) -> [Int] -> ([Int], [Int])
forall a. (a -> Bool) -> [a] -> ([a], [a])
break (\Int
z -> Int -> Int
canonicalCombiningClass Int
z Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0) [Int]
zs of
      ([], [Int]
ys) -> ([], [Int]
ys)
      ([Int]
xs, [Int]
ys) -> ((Int -> Int) -> [Int] -> [Int]
forall b a. Ord b => (a -> b) -> [a] -> [a]
sortOn Int -> Int
canonicalCombiningClass [Int]
xs, [Int]
ys)

recursivelyDecompose :: [Int] -> [Int]
recursivelyDecompose :: [Int] -> [Int]
recursivelyDecompose = (Int -> [Int] -> [Int]) -> [Int] -> [Int] -> [Int]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Int -> [Int] -> [Int]
go [Int]
forall a. Monoid a => a
mempty
  where go :: Int -> [Int] -> [Int]
go Int
c
          | Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xc0 = (Int
c Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:)
          | Int -> Bool
isHangulSyllable Int
c = Int -> [Int] -> [Int]
decomposeHangulSyllable Int
c
          | Bool
otherwise =
              case Int -> IntMap [Int] -> Maybe [Int]
forall a. Int -> IntMap a -> Maybe a
M.lookup Int
c IntMap [Int]
canonicalDecompositionMap of
                Maybe [Int]
Nothing -> (Int
c Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:)
                Just [Int]
ds -> (\[Int]
xs -> (Int -> [Int] -> [Int]) -> [Int] -> [Int] -> [Int]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Int -> [Int] -> [Int]
go [Int]
xs [Int]
ds)

-- | Hangul syllable range is AC00 - D7A3.
isHangulSyllable :: Int -> Bool
isHangulSyllable :: Int -> Bool
isHangulSyllable Int
cp = Int
cp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0xAC00 Bool -> Bool -> Bool
&& Int
cp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xD7A3

-- Hangul decomposition is algorithmic; see "Hangul Syllable Decomposition" in
-- the Unicode spec, which gives this algorithm:
--
-- SBase = AC0016
-- LBase = 110016
-- VBase = 116116
-- TBase = 11A716
-- LCount = 19
-- VCount = 21
-- TCount = 28
-- NCount = 588 (VCount * TCount) SCount = 11172 (LCount * NCount)
-- SIndex = s - SBase
-- LIndex = SIndex div NCount
-- VIndex = (SIndex mod NCount) div TCount TIndex = SIndex mod TCount
-- LPart = LBase + LIndex
-- VPart = VBase + VIndex
-- TPart = TBase + TIndex if TIndex > 0
-- If TIndex = 0, then there is no trailing consonant, so map the precomposed
-- Hangul syllable s to its full decomposition d = <LPart, VPart>. Otherwise,
-- there is a trailing consonant, so map s to its full decomposition d = <LPart,
-- VPart, TPart>.

decomposeHangulSyllable :: Int -> ([Int] -> [Int])
decomposeHangulSyllable :: Int -> [Int] -> [Int]
decomposeHangulSyllable !Int
c =
  if Int
sindex Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0 Bool -> Bool -> Bool
|| Int
sindex Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
scount
     then (Int
cInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:)
     else
       let l :: Int
l = Int
lbase Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
sindex Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
ncount)
           v :: Int
v = Int
vbase Int -> Int -> Int
forall a. Num a => a -> a -> a
+ ((Int
sindex Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
ncount) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
tcount)
           t :: Int
t = Int
tbase Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
sindex Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
tcount)
        in if Int
t Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Int
tbase
              then (Int
lInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:) ([Int] -> [Int]) -> ([Int] -> [Int]) -> [Int] -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
vInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:) ([Int] -> [Int]) -> ([Int] -> [Int]) -> [Int] -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
tInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:)
              else (Int
lInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:) ([Int] -> [Int]) -> ([Int] -> [Int]) -> [Int] -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
vInt -> [Int] -> [Int]
forall a. a -> [a] -> [a]
:)
 where
  !sindex :: Int
sindex = Int
c Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
sbase
  !sbase :: Int
sbase = Int
0xAC00
  !lbase :: Int
lbase = Int
0x1100
  !vbase :: Int
vbase = Int
0x1161
  !tbase :: Int
tbase = Int
0x11A7
  !tcount :: Int
tcount = Int
28
  !ncount :: Int
ncount = Int
588 -- vcount * tcount
  !scount :: Int
scount = Int
11172 -- lcount * ncount
  -- !lcount = 19
  -- !vcount = 21