{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}

-- | The module implements the tokenization used within Nerf
-- and some other tokenization-related stuff.

module NLP.Nerf.Tokenize
(
-- * Tokenization
  tokenize
-- * Synchronization
, Word
, moveNEs
) where

import Control.Monad ((>=>))
import Data.Foldable (foldMap)
import qualified Data.Char as Char
import qualified Data.List as L
import qualified Data.Tree as T
import qualified Data.Traversable as Tr
import qualified Data.Text as Text
import qualified Data.Text.Lazy as LazyText
import qualified NLP.Tokenize as Tok

import Data.Named.Tree (NeForest, NeTree, groupForestLeaves)

---------------------------
-- Tokenization definition.
---------------------------

-- | Default tokenizator.
defaultTokenizer :: Tok.Tokenizer
defaultTokenizer
    =   Tok.whitespace
    >=> Tok.uris
    >=> Tok.punctuation

-- | Tokenize sentence using the default tokenizer.
tokenize :: String -> [String]
tokenize = Tok.run defaultTokenizer

---------------------------------------------------------------
-- Synchronizing named entities with new sentence tokenization.
---------------------------------------------------------------

-- | A class of objects with size.
class Word a where
    size        :: a -> Int
    rmSpaces    :: a -> a

instance Word String where
    size = length
    rmSpaces = filter (not . Char.isSpace)

instance Word Text.Text where
    size = Text.length
    rmSpaces = Text.filter (not . Char.isSpace)

instance Word LazyText.Text where
    size = fromInteger . toInteger . LazyText.length
    rmSpaces = LazyText.filter (not . Char.isSpace)

essence :: Word a => a -> Int
essence = size . rmSpaces
{-# INLINE essence #-}

-- | Syncronization between two sentences.  Each (xs, ys) pair represents
-- tokens from the two input sentences which corresponds to each other.
type Sync a = [([a], [a])]

-- | Synchronize two tokenizations of the sentence.
sync :: Word a => [a] -> [a] -> Sync a
sync = sync' 0

sync' :: Word a => Int -> [a] -> [a] -> Sync a
sync' r (x:xs) (y:ys)
    | n + r == m    = ([x], [y])    : sync' 0       xs    ys
    | n + r  < m    = join x        $ sync' (n + r) xs (y:ys)
    | otherwise     = swap . join y $ sync' (m - r) ys (x:xs)
  where
    n = essence x
    m = essence y
    join l ((ls, rs) : ps)  = (l:ls, rs) : ps
    join _ []               = error "sync'.join: bad arguments"
    swap ((ls, rs) : ps)    = (rs, ls) : swap ps
    swap []                 = []
sync' 0 [] [] = []
sync' _ _  _  = error "sync': bad arguments"

-- | Match the `Sync` with the given list, return the matching result
-- (snd elements of the `Sync` list) and the rest of the `Sync` list.
match :: Word a => [a] -> Sync a -> ([a], Sync a)
match xs ss =
    let (sl, sr) = splitAcc isMatch 0 ss
    in  (concatMap snd sl, sr)
  where
    n = sum (map essence xs)
    isMatch r (ys, _)
        | m + r < n     = (m + r, False)
        | m + r == n    = (m + r, True)
        | otherwise     = error "match.isMatch: no match"
      where
        m = sum (map essence ys)

-- | Split the list with the help of the accumulating function.
splitAcc :: (acc -> a -> (acc, Bool)) -> acc -> [a] -> ([a], [a])
splitAcc _ _ [] = ([], [])
splitAcc f acc (x:xs)
    | cond      = ([x], xs)
    | otherwise = join x (splitAcc f acc' xs)
  where
    (acc', cond) = f acc x
    join y (ys, zs) = (y:ys, zs)

-- | List forest leaves.
leaves :: NeForest a b -> [b]
leaves = concatMap $ foldMap (either (const []) (:[]))

unGroupLeaves :: NeForest a [b] -> NeForest a b
unGroupLeaves = concatMap unGroupLeavesT

unGroupLeavesT :: NeTree a [b] -> [NeTree a b]
unGroupLeavesT (T.Node (Left v) xs)     =
    [T.Node (Left v) (unGroupLeaves xs)]
unGroupLeavesT (T.Node (Right vs) _)   =
    [T.Node (Right v) [] | v <- vs]

substGroups :: Word b => NeForest a [b] -> Sync b -> NeForest a [b]
substGroups fs ss = snd $ L.mapAccumL substGroupsT ss fs

substGroupsT :: Word b => Sync b -> NeTree a [b] -> (Sync b, NeTree a [b])
substGroupsT =
    Tr.mapAccumL f
  where
    f s (Left v)  = (s, Left v)
    f s (Right v) =
        let (v', s') = match v s
        in  (s', Right v')

-- | Synchronize named entities with tokenization represented
-- by the second function argument.  Of course, both arguments
-- should relate to the same sentence.
moveNEs :: Word b => NeForest a b -> [b] -> NeForest a b
moveNEs ft ys
    = unGroupLeaves
    $ substGroups
        (groupForestLeaves true ft)
        (sync (leaves ft) ys)
  where
    true _ _ = True