> module Kulitta.Learning.CykParser where
> import Data.List
> import Kulitta.Learning.Parser

CYK Implementation
Donya Quick
Last modified: 22-Jan-2016



type Rule a = (a, [a])

> findProducers :: (Eq a) => [Rule a] -> [a] -> [Rule a]
> findProducers rs str = filter (\(l,r) -> r==str) rs

> cat :: [a] -> [a] -> [[a]]
> cat xs ys = [[x,y] | x<-xs, y<-ys]


> nextRow :: (Eq a) => [Rule a] -> [[[a]]] -> [[a]]
> nextRow rs rows =
>     let n = length rows + 1 -- we assume terminal level is not included

>         segs = map (\i -> (i,n-i)) [1..n-1]
>         strs offset (i,j) = cat
>             (rows !! (i-1) !! offset)
>             (rows !! (j-1) !! (offset + i))
>         rules offset (i,j) = concatMap (map fst. findProducers rs)
>                              (strs offset (i,j))
>         f offset = nub $ concatMap (rules offset) segs
>     in  map f [0..length (rows !! 0) - n]


> mkSegs' :: Int -> Int -> [[Int]]
> mkSegs' n m = filter (\s -> sum s == n) $
>     makeRange $ take m $ repeat (0,n) where
>     makeRange = foldr (\(l,u) xs -> [(a:b) | a<-[l..u], b<-xs]) [[]]

> mkSegs :: Int -> Int -> [[Int]]
> mkSegs n m = filter (\s -> length s > 1) $ map (filter (>0)) $ mkSegs' n m

mkSegs must be called with:
n = the maximum length of the string in question.
m = the maximum possible number of substrings

n needs to be controlled by both the level (row+1) and
the offset. The formula should be:

n = min (length rows) (length (rows !! 0) - offset)

> toInds :: Int -> [Int] -> [(Int, Int)]
> toInds offset [] = []
> toInds offset (l:ls) =
>     let row = l - 1
>         col = offset
>     in  if l <= 0 then toInds offset ls
>         else (row, col) : toInds (offset+l) ls

> toStrs :: [[[a]]] -> [(Int, Int)] -> [[a]]
> toStrs rows [] = [[]]
> toStrs rows ((i,j):cs) =
>     let strs = toStrs rows cs
>         theCell = if i < length rows && j < length (rows !! i)
>                  then rows !! i !! j
>                  else error ("(toStr) Bad box: ("++show i++", "++show j++")")
>     in  [(x:y) | x<-theCell, y<-strs]

> nextRowM m rs rows = -- m is the # of subdivisions

>     let n = length rows +1 -- the "level"

>         m' = min n m -- the number of substrings (or nonterms)

>         segs = mkSegs n m' -- this is ok

>         offsets = [0..length (rows !! 0) - n] -- all offsets

>         nts o s = concatMap (map fst . findProducers rs) (toStrs rows (toInds o s))
>         f o = concatMap (nts o) segs
>     in  map f offsets

> nextRowM2 m rs rows = -- m is the # of subdivisions

>     let n = length rows +1 -- the "level"

>         m' = min n m -- the number of substrings (or nonterms)

>         segs = mkSegs n m' -- this is ok

>         offsets = [0..length (rows !! 0) - n] -- all offsets

>         nts o s = concatMap (map fst . findProducers rs) (toStrs rows (toInds o s))
>         f o = concatMap (nts o) segs
>     in  map f offsets

> allRowsMS :: (Eq a) => Int -> [Rule a] -> [a] -> [[[a]]]
> allRowsMS m rs str = allRows' rs [fixRow rs $ firstRow' rs str] where
>     allRows' rs rows = if length rows == length (head rows) then rows
>                        else allRows' rs (rows ++ [fixRow rs $ nextRowM m rs rows])

> allRows :: (Eq a) => [Rule a] -> [a] -> [[[a]]]
> allRows rs str = allRows' rs [firstRow rs str] where
>     allRows' rs rows = if length rows == length (head rows) then rows
>                        else allRows' rs (rows ++ [nextRow rs rows])


> showRows :: (Show a) => [[[a]]] -> String
> showRows rs =
>     let f line = concatMap g line ++ "\n"
>         g bucket = show bucket ++ "\t"
>     in  concatMap f (reverse rs)

> printRows :: (Show a) => [[[a]]] -> IO ()
> printRows = putStr . showRows

==============

SYNONYM EXTENION

> findSynonyms :: (Eq a) => [Rule a] -> a -> [a]
> findSynonyms rules x = map fst $ filter (\(l,r) -> r==[x]) rules

> findSynRec :: (Eq a) => [Rule a] -> [a] -> [a]
> findSynRec rules syns =
>     let s = nub (syns ++ concatMap (findSynonyms rules) syns)
>     in  if s == syns then syns else findSynRec rules s


 findSynRec :: (Eq a) => [Rule a] -> a -> [a]
 findSynRec rules x =
     let s = findSynonyms rules x
         s' = nub (s ++ concatMap (findSynonyms rules) s)
     in  if s == s' then s else nub $ s ++ concatMap (findSynRec rules) s'

> fixSyns :: (Eq a) => [Rule a] -> [a] -> [a]
> fixSyns rules bucket = nub (bucket ++ concatMap (findSynonyms rules) bucket)

> fixRow :: (Eq a) => [Rule a] -> [[a]] -> [[a]]
> fixRow rules row = map (findSynRec rules) row

> allRowsS :: (Eq a) => [Rule a] -> [a] -> [[[a]]]
> allRowsS rs str = allRows' rs [fixRow rs $ firstRow rs str] where
>     allRows' rs rows = if length rows == length (head rows) then rows
>                        else allRows' rs (rows ++ [fixRow rs $ nextRow rs rows])

> firstRowOld rs cs = map (nub . map fst . findProducers rs . \a -> [a]) cs

> firstRow rs cs = map (\c -> [c]) cs

> firstRow' rs [] = []
> firstRow' rs (c:cs) =
>     let fr0 = nub $ c : (map fst $ findProducers rs [c])
>     in  (if null fr0 then [c] else fr0) : firstRow' rs cs



==============


GENERATING ALL PARSES

 allParses :: [Rule a] -> [[[a]]] -> Int -> Int [[Rule a]]
 allParses rules rows i j = -- i is the row, j is the column

     let f :: a -> [Rule a]
         f x = filter (\(l,r) -> l==x) rules
     in  undefined

mkSegs :: Int -> Int -> [[Int]]
mkSegs n m = filter (\s -> length s > 1) $ map (filter (>0)) $ mkSegs' n m

mkSegs must be called with:
n = the maximum length of the string in question.
m = the maximum possible number of substrings

Given a rule that we know can be applied, pick the cells it generates.

> getCells :: (Eq a) => [[[a]]] -> Rule a -> Int -> Int -> [[(Int, Int)]]
> getCells rows (lhs, rhs) level offset =
>     let n = level + 1
>         m = length rhs -- number of syms to genererate

>         segs = mkSegs n m -- get all possible ways to chunk the string

>         inds = map (toInds offset) segs -- :: [[(Int, Int]] turn these into cells

>         -- need to filter the cells now! --

>     in  filter (goodCells rows rhs) inds

> goodCells :: (Eq a) => [[[a]]] -> [a] -> [(Int, Int)] -> Bool
> goodCells rows [] [] = True
> goodCells rows (x:xs) ((i,j):is) =
>     elem x (rows !! i !! j) && goodCells rows xs is

> appendTo :: (Eq a) => [[[a]]] -> (Int, Int) -> a -> [[[a]]]
> appendTo [] (i,j) x = []
> appendTo xs (i,j) x =
>     let (preRs, theRow, postRs) = (take i xs, xs !! i, drop (i+1) xs)
>         (preCs, theCell, postCs) = (take j theRow, theRow !! j, drop (j+1) theRow)
>         newCell = nub (x : theCell) -- ensure no duplicates

>     in  preRs ++ (preCs ++ newCell : postCs) : postRs

> appendTo2 :: (Eq a) => [[[a]]] -> (Int, Int) -> [a] -> [[[a]]]
> appendTo2 [] (i,j) x = []
> appendTo2 xs (i,j) x =
>     let (preRs, theRow, postRs) = (take i xs, xs !! i, drop (i+1) xs)
>         (preCs, theCell, postCs) = (take j theRow, theRow !! j, drop (j+1) theRow)
>         newCell = nub (x ++ theCell) -- ensure no duplicates

>     in  preRs ++ (preCs ++ newCell : postCs) : postRs


The parseDown1 function completes a parse from a particular cell and symbol. We
assume the symbol is a member of the cell in question.

> doAll = True

> parseDown1 :: (Eq a) => [[[a]]] -> [Rule a] -> [[[a]]] -> ((Int, Int), a) -> [[[[a]]]]
> parseDown1 rows rules newRows ((0,j),x) = [appendTo newRows (0,j) x]
> parseDown1 rows rules newRows ((i,j),x) =
>     let newRows' = appendTo newRows (i,j) x -- put x in the current table

>         pRules = filter (\(l,r) -> l==x && length r > 0) rules -- rules of form A->BC...N

>         sRules = filter (\(l,r) -> l==x && length r == 1) rules -- rules of form A->B

>         f r = getCells rows r i j -- get a rule's target cells (CAN BE >1 LIST!)

>         f2 r@(lhs,rhs) = map (zipWith (\a (b,c) -> ((b,c),a)) rhs) (f r) -- group with coords

>         --pCells type is [[[((Int, Int), a)]]] 

>         pCells = filter (\l -> not $ null l) $ map f2 pRules -- get cells according to pRules 

>         recCall pCell = parseDown1 rows rules newRows' pCell -- recurse on rule expansions

>         pTabs = map (map (map recCall)) pCells -- gives one SET of tables per pCell

>         pTabs' = map (map combineSets) pTabs -- gives one SET of tables per inner pCell list (can be >1)

>         pTabs'' = concat $ concat pTabs' -- flatten lists, the parses are done

>         syns = filter (/=x) $ map (\(l,r) -> head r) $ sRules -- what synonym symbols are there?

>         synResults = concatMap (\a -> parseDown1 rows rules newRows' ((i,j),a)) syns -- recurse on syns

>     in  synResults ++ pTabs''


The parseDown function takes the cyk rows, the grammar's rules, and the start symbol.

> parseDown rows rules s =
>     let n = length $ head rows
>     in  map (map (map reverse)) $ parseDown1 rows rules (emptyRows n) ((n-1,0),s)

 xtrs = [(1,[1,1]), (1,[1,2]), (2,[2,2]), (3, [1]), (1, [1])] :: [Rule Int]
 xstr = [1,1,1,1] :: [Int]
 xp = allRowsMS 2 xtrs xstr
 xtest1 = parseDown xp xtrs 3

> emptyRows n =
>     if n <= 0 then [] else take n (repeat []) : emptyRows (n-1)

2 x
1 x x
0 x x x
  0 1 2

The combineSets function takes a bunch of table sets, one set per
cell that has been parsed, and finds every combination of them.

> combineSets :: (Eq a) => [[[[[a]]]]] -> [[[[a]]]]
> combineSets [] = error "(combineSets) No sets to combine!"
> combineSets [tset] = tset
> combineSets (tset:moreSets) =
>     let pairs = [(a,b) | a<-tset, b<-combineSets moreSets]
>     in  map (\(a,b) -> combine1 a b) pairs


> -- merges N tables

> combineN :: (Eq a) => [[[[a]]]] -> [[[a]]]
> combineN [] = error "(combineN) No tables to combine!"
> combineN [t] = t
> combineN (t:ts) = combine1 t (combineN ts)

> -- merges two tables

> combine1 :: (Eq a) => [[[a]]] -> [[[a]]] -> [[[a]]]
> combine1 tab1 tab2 =
>     let n = length $ head tab1
>         is = [(i,j) | i<-[0..n-1], j<-[0..n-1], j<=n-i-1]
>         xs = map (\(i,j) -> tab2 !! i !! j) is
>     in  foldUpdate2 tab1 (zip is xs)

> foldUpdate :: (Eq a) => [[[a]]] -> [((Int, Int), a)] -> [[[a]]]
> foldUpdate table [] = table
> foldUpdate table ((c,x):xs) = foldUpdate (appendTo table c x) xs


> foldUpdate2 :: (Eq a) => [[[a]]] -> [((Int, Int), [a])] -> [[[a]]]
> foldUpdate2 table [] = table
> foldUpdate2 table ((c,x):xs) = foldUpdate2 (appendTo2 table c x) xs