module NLP.Nerf.Tokenize
(
tokenize
, Word
, moveNEs
) where
import Control.Monad ((>=>))
import Data.Foldable (foldMap)
import qualified Data.Char as Char
import qualified Data.List as L
import qualified Data.Tree as T
import qualified Data.Traversable as Tr
import qualified Data.Text as Text
import qualified Data.Text.Lazy as LazyText
import qualified NLP.Tokenize as Tok
import Data.Named.Tree (NeForest, NeTree, groupForestLeaves)
defaultTokenizer :: Tok.Tokenizer
defaultTokenizer
= Tok.whitespace
>=> Tok.uris
>=> Tok.punctuation
tokenize :: String -> [String]
tokenize = Tok.run defaultTokenizer
class Word a where
size :: a -> Int
rmSpaces :: a -> a
instance Word String where
size = length
rmSpaces = filter (not . Char.isSpace)
instance Word Text.Text where
size = Text.length
rmSpaces = Text.filter (not . Char.isSpace)
instance Word LazyText.Text where
size = fromInteger . toInteger . LazyText.length
rmSpaces = LazyText.filter (not . Char.isSpace)
essence :: Word a => a -> Int
essence = size . rmSpaces
type Sync a = [([a], [a])]
sync :: Word a => [a] -> [a] -> Sync a
sync = sync' 0
sync' :: Word a => Int -> [a] -> [a] -> Sync a
sync' r (x:xs) (y:ys)
| n + r == m = ([x], [y]) : sync' 0 xs ys
| n + r < m = join x $ sync' (n + r) xs (y:ys)
| otherwise = swap . join y $ sync' (m r) ys (x:xs)
where
n = essence x
m = essence y
join l ((ls, rs) : ps) = (l:ls, rs) : ps
join _ [] = error "sync'.join: bad arguments"
swap ((ls, rs) : ps) = (rs, ls) : swap ps
swap [] = []
sync' 0 [] [] = []
sync' _ _ _ = error "sync': bad arguments"
match :: Word a => [a] -> Sync a -> ([a], Sync a)
match xs ss =
let (sl, sr) = splitAcc isMatch 0 ss
in (concatMap snd sl, sr)
where
n = sum (map essence xs)
isMatch r (ys, _)
| m + r < n = (m + r, False)
| m + r == n = (m + r, True)
| otherwise = error "match.isMatch: no match"
where
m = sum (map essence ys)
splitAcc :: (acc -> a -> (acc, Bool)) -> acc -> [a] -> ([a], [a])
splitAcc _ _ [] = ([], [])
splitAcc f acc (x:xs)
| cond = ([x], xs)
| otherwise = join x (splitAcc f acc' xs)
where
(acc', cond) = f acc x
join y (ys, zs) = (y:ys, zs)
leaves :: NeForest a b -> [b]
leaves = concatMap $ foldMap (either (const []) (:[]))
unGroupLeaves :: NeForest a [b] -> NeForest a b
unGroupLeaves = concatMap unGroupLeavesT
unGroupLeavesT :: NeTree a [b] -> [NeTree a b]
unGroupLeavesT (T.Node (Left v) xs) =
[T.Node (Left v) (unGroupLeaves xs)]
unGroupLeavesT (T.Node (Right vs) _) =
[T.Node (Right v) [] | v <- vs]
substGroups :: Word b => NeForest a [b] -> Sync b -> NeForest a [b]
substGroups fs ss = snd $ L.mapAccumL substGroupsT ss fs
substGroupsT :: Word b => Sync b -> NeTree a [b] -> (Sync b, NeTree a [b])
substGroupsT =
Tr.mapAccumL f
where
f s (Left v) = (s, Left v)
f s (Right v) =
let (v', s') = match v s
in (s', Right v')
moveNEs :: Word b => NeForest a b -> [b] -> NeForest a b
moveNEs ft ys
= unGroupLeaves
$ substGroups
(groupForestLeaves true ft)
(sync (leaves ft) ys)
where
true _ _ = True