-- | The basic data types for formal languages up to and including context-free -- grammars. -- -- TODO we shall have to extend the system for multi-tape grammars to allow -- combined terminal/non-terminal systems. This will basically mean dealing -- with context-sensitive grammars, at which we can just fully generalize -- everything. -- -- TODO we need a general system to generate fresh variable names of varying -- dimension. This is very much desired for certain operations (especially on -- products). -- -- BIGTODO @E _@ are actually the "None" thing in ADPfusion; while normal -- epsilons are just terminals. module FormalLanguage.CFG.Grammar ( module FormalLanguage.CFG.Grammar.Types , module FormalLanguage.CFG.Grammar.Util ) where import FormalLanguage.CFG.Grammar.Types import FormalLanguage.CFG.Grammar.Util {- import Data.List (partition,sort,nub) import Data.Monoid -} {- -- | A grammar with normalized start and epsilon symbols (i) has a start -- symbol, whose RHSs only point to single syntactic variables. (ii) It has -- terminating rules only when the single RHS symbol is a real epsilon symbol. normalizeStartEpsilon :: Grammar -> Grammar normalizeStartEpsilon g = gE -- good start rules go from the start symbol to a single syntactic -- symbol. It should not be the start symbol. where srs = [r | r<-g^..rules.folded , r^.lhs == g^.start] (gsr,bsr) = partition goodStartRule srs goodStartRule (Rule _ _ [r]) | isSyntactic r && r /= g^.start = True goodStartRule _ = False -- now we need to process the start rules. We create a fresh synvar, -- and the corresponding rules. d = dim g s = freshStartSymbol g sf = freshStartFun g srs' = [Rule s (replicate d sf) [r^.lhs] | r<-srs] gS = if null bsr then g else (g & rules %~ S.union (S.fromList srs')) & start .~ s -- otherwise, add new rules, set new start symbol -- good epsilon rules go from a syntactic variable directly to epsilon -- with no additional symbols on the RHS. ers = [r | r<-gS^..rules.folded, any isEpsilon (r^.rhs)] (ger,ber) = partition goodEpsilonRule ers goodEpsilonRule (Rule _ _ [r]) | isEpsilon r = True goodEpsilonRule _ = False -- same for the epsilon rules e = freshTermSynVar gS ef = freshTermFun gS ers' = concat [ [ Rule e (replicate d ef) [r] , Rule l f (rsl++[e]++rsr) ] | (Rule l f rs') <- ers , let (rsl,(r:rsr)) = span (not . isEpsilon) rs' ] gE = if null ber then gS else (gS & rules %~ (S.\\ S.fromList ers)) & rules %~ S.union (S.fromList ers') -- otherwise, replace old rules -- | Given a grammar, generate a fresh start syntactic variable with a name -- that is not "too weird". Also transfers any index structure to the start -- symbol. -- -- TODO if indices are correctly transferred needs be closely checked. freshStartSymbol :: Grammar -> Symbol freshStartSymbol g | d == 0 = error "zero-dim grammar" | otherwise = zipWith SynVar (replicate d x) ix where ss = ["S"] ++ (map (++"'") $ g^..start.folded.name) ++ map (\i -> "S" ++ show i) [1 :: Int ..] x = head $ dropWhile (`M.member` (g^.synvars)) ss d = dim g ix :: [[Index]] ix = g^..start.folded.index -- | Given a grammar, generate a fresh terminating syntactic variable (that -- only traverses to @Epsilon@ rules), that is not "too weird". freshTermSynVar :: Grammar -> Symbol freshTermSynVar g | d == 0 = error "zero-dim grammar" | otherwise = replicate d (SynVar e []) where es = ["E"] ++ map (\i -> "E" ++ show i) [1 :: Int .. ] e = head $ dropWhile (`M.member` (g^.synvars)) es d = dim g -- | Create a fresh start function symbol. freshStartFun :: Grammar -> String freshStartFun g | S.null ks = error "no rules in grammar?" | otherwise = f where ks = S.fromList $ g^..rules.folded.attr.folded fs = ["fS"] ++ map (\i -> "fS" ++ show i) [1::Int ..] f = head $ dropWhile (`S.member` ks) fs -- | Create a fresh terminating transition function symbol freshTermFun :: Grammar -> String freshTermFun g | S.null ks = error "no rules in grammar?" | otherwise = f where ks = S.fromList $ g^..rules.folded.attr.folded fs = ["fE"] ++ map (\i -> "fE" ++ show i) [1::Int ..] f = head $ dropWhile (`S.member` ks) fs -- | Collect all terminal symbols from the rules (for cfg's it's not really -- needed to include the lhs). grammarTerminals :: Grammar -> [Symbol] grammarTerminals g = nub . sort . filter isTerminal $ (g^..rules.folded.lhs) ++ (g^..rules.folded.rhs.folded) -- | Collect all non-terminal symbols from the rules. -- -- TODO WARNING: Problems handling syn-terms in outside grammars. grammarSynVars :: Grammar -> [Symbol] grammarSynVars g = nub . sort . filter isSyntactic $ (g^..rules.folded.lhs) ++ (g^..rules.folded.rhs.folded) -} {- -- * Basic data types for formal grammars. -- | Grammar indices are enumerable objects -- -- TODO should we always assume operations "modulo"? data Enumerable = Singular | IntBased Integer Integer -- current index, maximum index -- | Enumerated String [String] deriving (Eq,Ord,Show,Typeable,Data) _IntBased :: Prism' Enumerable (Integer,Integer) _IntBased = prism (uncurry IntBased) $ f where f Singular = Left Singular f (IntBased c m) = Right (c,m) ibCurrent = _IntBased . _1 ibModulus = _IntBased . _2 instance Default Enumerable where def = Singular -- | A single-dimensional terminal or non-terminal symbol. @E@ is a special -- symbol denoting that nothing should be done. -- -- TODO write Eq,Ord by hand. Fail with error if Enumerable is not equal (this -- should actually be caught in the combination operations). -- | 'T' – A terminal symbol (excluding epsilon) -- -- 'N' – A non-terminal symbol (again, excluding non-terminal epsilons) -- -- 'E' – Epsilon characters, may be named differently data TN where T :: String -> TN N :: String -> Enumerable -> TN E :: TN deriving instance Show TN deriving instance Eq TN deriving instance Ord TN deriving instance Typeable TN deriving instance Data TN isT = \case {T _ -> True; _ -> False} isN = \case {N _ _ -> True; _ -> False} isE = \case {E -> True; _ -> False} tnName :: Lens' TN String tnName f (T s ) = T <$> f s tnName f (N s e) = (\s' -> N s' e) <$> f s tnName f (E ) = (const E) <$> f "ε" _T :: Prism' TN String _T = prism T $ f where f (T s) = Right s f z = Left z _N :: Prism' TN (String,Enumerable) _N = prism (uncurry N) $ f where f (N s e) = Right (s,e) f z = Left z _E :: Prism' TN () _E = prism (const E) $ f where f E = Right () f z = Left z enumed = _N . _2 -- | Is a symbol of @Outside@ or @Inside@ type? data InsideOutside = Inside | Outside deriving (Show,Eq,Ord,Typeable,Data) -- | A complete grammatical symbol is multi-dimensional with 0.. -- dimensions. -- -- TODO we should expand this to three cases: (i) only terminals, (ii) only -- syntactic variables, (iii) mixed cases data Symb = Symb { inOut :: InsideOutside , getSymbs :: [TN] } deriving instance Show Symb deriving instance Eq Symb deriving instance Ord Symb deriving instance Typeable Symb deriving instance Data Symb symb :: Lens' Symb [TN] symb f (Symb io xs) = Symb io <$> f xs -- are we sure? symbInOut :: Lens' Symb InsideOutside symbInOut f (Symb io xs) = (`Symb` xs) <$> f io sDim :: Symb -> Int sDim = length . getSymbs type instance Index Symb = Int type instance IxValue Symb = TN instance Ixed Symb where ix k f (Symb io xs) = Symb io <$> ix k f xs {-# INLINE ix #-} -- | A production rule goes from a left-hand side (lhs) to a right-hand side -- (rhs). The rhs is evaluated using a function (fun). -- -- TODO These production rules currently do not allow "typical" -- context-sensitive grammars with terminal symbols on the left-hand side. data Rule = Rule { _lhs :: Symb , _fun :: [String] -- Fun , _rhs :: [Symb] } deriving (Eq,Ord,Show,Typeable,Data) makeLenses ''Rule -- | A complete grammar with a set of terminal symbol (tsyms), non-terminal -- symbol (nsyms), production rules (rules) and a start symbol (start). -- -- TODO Combined terminal and non-terminal symbols in multi-tape grammars are -- denoted as non-terminal symbols. -- -- TODO rename epsis -> esyms -- -- TODO tsyms, esyms are not symbols but should just be 1-dim things ... -- -- TODO nsyms should maybe be 1-dim, then have another thing for the multidim things data Grammar = Grammar { _tsyms :: Set Symb , _nsyms :: Set Symb , _nIsms :: Set Symb -- in case of an outside grammar, this contains the inside syntactic variables that now act as "kind-of" terminals , _epsis :: Set TN , _rules :: Set Rule , _start :: Maybe Symb , _name :: String } deriving (Show,Data,Typeable) makeLenses ''Grammar -- | Determines if this is an outside grammar isOutsideGrammar :: Grammar -> Bool isOutsideGrammar g = anyOf folded (\(Symb io _) -> io==Outside) $ g^.nsyms -- | the dimension of the grammar. Grammars with no symbols have dimension 0. gDim :: Grammar -> Int gDim g | Just (x,_) <- S.minView (g^.nsyms) = length $ x^.symb | Just (x,_) <- S.minView (g^.tsyms) = length $ x^.symb | otherwise = 0 -- | Helper function giving the grammar name. Will add an @Outside@ prefix, -- where necessary. grammarName :: Grammar -> String grammarName g = if isOutsideGrammar g then "Outside" ++ g^.name else "" ++ g^.name -- * Helper functions on rules and symbols. -- | Symb is completely in terminal form. isSymbT :: Symb -> Bool isSymbT (Symb io xs) = allOf folded tTN xs && anyOf folded (\case (T _) -> True ; _ -> False) xs tTN :: TN -> Bool tTN (T _ ) = True tTN (E ) = True tTN (N _ _) = False isSymbE :: Symb -> Bool isSymbE (Symb io xs) = allOf folded (\case E -> True ; _ -> False) xs -- | Symb is completely in non-terminal form. isSymbN :: Symb -> Bool isSymbN (Symb io xs) = allOf folded nTN xs && anyOf folded (\case (N _ _) -> True ; _ -> False) xs {- -- | Generalized non-terminal symbol with at least one non-terminal Symb. nSymbG :: Symb -> Bool nSymbG (Symb xs) = allOf folded nTN xs && anyOf folded (\case (N _ _) -> True ; _ -> False) xs -} nTN :: TN -> Bool nTN (N _ _) = True nTN (E ) = True nTN (T _ ) = False -- * Determine grammar types -- -- For grammars where the number of non-terminal symbols is restricted, we -- allow as non-terminal also the generalized variants that have partial -- terminal symbols. -- -- TODO maybe restrict those to epsilon-type terminals in generalized -- non-terminals. -- | Right-linear grammars have at most one non-terminal on the RHS. It is the -- last symbol. isRightLinear :: Grammar -> Bool isRightLinear g = allOf folded isrl $ g^.rules where isrl :: Rule -> Bool isrl (Rule l _ []) = isSymbN l isrl (Rule l _ rs) = isSymbN l && (allOf folded (not . isSymbN) $ init rs) -- | Linear grammars just have a single non-terminal on the right-hand side. isLinear :: Grammar -> Bool isLinear g = error "isLinear: write me" -- allOf folded ((<=1) . length . filter nSymbG -- * Different normal forms for grammars. -- | Transform a grammar into CNF. (cf. COL-2007) -- -- TODO make sure we use a variant that computes small grammars. chomskyNF :: Grammar -> Grammar chomskyNF = error "chomsky" isChomskyNF :: Grammar -> Bool isChomskyNF g = allOf folded isC $ g^.rules where isC :: Rule -> Bool isC (Rule _ _ [s]) = isSymbT s isC (Rule _ _ [s,t]) = isSymbN s && isSymbN t isC _ = False -- | Transform grammar into GNF. -- -- http://dl.acm.org/citation.cfm?id=321254 greibachNF :: Grammar -> Grammar greibachNF = error "gnf" -- | Check if grammar is in Greibach Normal Form isGreibachNF :: Grammar -> Bool isGreibachNF g = allOf folded isG $ g^.rules where isG :: Rule -> Bool isG (Rule _ _ (t:ns)) = isSymbT t && all isSymbN ns isG _ = False -- | A grammar is epsilon-free if no rule has an empty RHS, resp. any rhs-symb -- is completely non-empty. -- -- TODO we should tape-split multi-tape grammars here and make sure that all -- individual tapes are epsilon-free as otherwise we can generate cases where -- the epsilon-aligned symbols lead to weird problems. So split into single -- tapes, then check. epsilonFree :: Grammar -> Bool epsilonFree g = allOf folded eFree $ g^.rules where eFree :: Rule -> Bool eFree (Rule l _ r) = undefined -- l == g^.start || (not $ null r) || anyOf folded (epsFree $ g^.start) r epsFree :: Symb -> Symb -> Bool epsFree = undefined -- | Collect all non-terminal symbols from the rules collectSymbN :: Grammar -> [Symb] collectSymbN g = nub . sort . filter isSymbN $ (g^..rules.folded.lhs) ++ (g^..rules.folded.rhs.folded) -- | Collect the syntactic variable symbols for either inside or outside, -- depending on the grammar collectInOutSymbN :: Grammar -> [Symb] collectInOutSymbN g = filter f xs where xs = collectSymbN g f (Symb Outside _) = isO f (Symb Inside _) = not isO isO = isOutsideGrammar g -- | Collect all terminal symbols from the rules (for cfg's it's not really -- needed to include the lhs). collectSymbT :: Grammar -> [Symb] collectSymbT g = nub . sort . filter isSymbT $ (g^..rules.folded.lhs) ++ (g^..rules.folded.rhs.folded) -}