{-# LANGUAGE RecordWildCards #-} module NLP.Adict.DAWG ( DAWGD , DAWG (..) , fromTrie , fromDAWG , size , row , Row (..) , entry , charOn , valueIn , edges , edgeOn , serialize , deserialize ) where import Control.Applicative ((<$>)) import Data.Maybe (listToMaybe) import Data.Binary (Binary, get, put) import qualified Data.Vector as V import NLP.Adict.DAWG.Node import qualified NLP.Adict.Trie as Trie -- | A DAWGD dictionary is a 'DAWG' which may have the 'Nothing' value -- along the path from the root to a leave. type DAWGD a b = DAWG a (Maybe b) -- | A directed acyclic word graph with character type @a@ and dictionary -- entry type @b@. data DAWG a b = DAWG { root :: Int -- ^ Root (index) of the DAWG , array :: V.Vector (Row a b) -- ^ Vector of DAWG nodes } -- | Find and eliminate all common subtries in the input trie -- and return the trie represented as a DAWG. fromTrie :: (Ord a, Ord b) => Trie.Trie a b -> DAWG a b fromTrie = deserialize . Trie.serialize -- | Transform the DAWG to implicit DAWG in a form of a trie. fromDAWG :: Ord a => DAWG a b -> Trie.Trie a b fromDAWG = Trie.deserialize . serialize size :: DAWG a b -> Int size = V.length . array {-# INLINE size #-} row :: DAWG a b -> Int -> Row a b row dag k = array dag V.! k {-# INLINE row #-} -- | A Row represents one node of the DAWG. data Row a b = Row { -- | Value in the node. rowValue :: b, -- | Edges to subnodes (represented by array indices) -- annotated with characters. rowEdges :: V.Vector (a, Int) } valueIn :: DAWG a b -> Int -> b valueIn dag k = rowValue (array dag V.! k) {-# INLINE valueIn #-} edges :: DAWG a b -> Int -> [(a, Int)] edges dag k = V.toList . rowEdges $ row dag k {-# INLINE edges #-} edgeOn :: Eq a => DAWG a b -> Int -> a -> Maybe Int edgeOn DAWG{..} k x = let r = array V.! k in snd <$> V.find ((x==).fst) (rowEdges r) entry :: DAWG a (Maybe b) -> [Int] -> Maybe ([a], b) entry dag xs = do x <- mapM (charOn dag) (zip (root dag:xs) xs) r <- maybeLast xs >>= valueIn dag return (x, r) where maybeLast [] = Nothing maybeLast ys = Just $ last ys charOn :: DAWG a b -> (Int, Int) -> Maybe a charOn dag (root, x) = listToMaybe [c | (c, y) <- edges dag root, x == y] serialize :: Ord a => DAWG a b -> [Node a b] serialize = map unRow . V.toList . array -- | Assumptiom: root node is last in the serialization list. deserialize :: Ord a => [Node a b] -> DAWG a b deserialize xs = let arr = V.fromList $ map mkRow xs in DAWG (V.length arr - 1) arr unRow :: Ord a => Row a b -> Node a b unRow Row{..} = mkNode rowValue (V.toList rowEdges) {-# INLINE unRow #-} mkRow :: Ord a => Node a b -> Row a b mkRow n = Row (nodeValue n) (V.fromList $ nodeEdges n) {-# INLINE mkRow #-} instance (Ord a, Binary a, Binary b) => Binary (DAWG a b) where put = put . serialize get = deserialize <$> get -- goDown :: DAWG a -> Int -> DAWG a -- goDown DAWG{..} k = DAWG k array -- -- instance T.Trie DAWGArray where -- unTrie dag@DAWGArray{..} = -- let row = array V.! root -- in ( valueIn row -- , [ (c, goDown dag k) -- | (c, k) <- U.toList (edgeVec row) ] ) -- child x dag@DAWGArray{..} = -- let row = array V.! root -- in goDown dag <$> snd <$> U.find ((x==).fst) (edgeVec row)