-- ----------------------------------------------------------------------------- -- -- AbsSyn.hs, part of Alex -- -- (c) Chris Dornan 1995-2000, Simon Marlow 2003 -- -- This module provides a concrete representation for regular expressions and -- scanners. Scanners are used for tokenising files in preparation for parsing. -- -- ----------------------------------------------------------------------------} module AbsSyn ( Code, Directive(..), Scheme(..), wrapperCppDefs, Scanner(..), RECtx(..), RExp(..), nullable, DFA(..), State(..), SNum, StartCode, Accept(..), RightContext(..), showRCtx, encodeStartCodes, extractActions, Target(..), UsesPreds(..), usesPreds, StrType(..) ) where import CharSet ( CharSet, Encoding ) import Data.Maybe ( fromJust ) import Data.Map ( Map ) import Data.IntMap ( IntMap ) import Util ( str, nl ) import qualified Data.Map as Map import qualified Data.Set as Set infixl 4 :|| infixl 5 :%% -- ----------------------------------------------------------------------------- -- Abstract Syntax for Alex scripts type Code = String data Directive = WrapperDirective String -- use this wrapper | EncodingDirective Encoding -- use this encoding | ActionType String -- Type signature of actions, -- with optional typeclasses | TypeClass String | TokenType String deriving Show data StrType = Str | Lazy | Strict | StrictText deriving Eq instance Show StrType where show Str = "String" show Lazy = "ByteString.ByteString" show Strict = "ByteString.ByteString" show StrictText = "Data.Text.Text" data Scheme = Default { defaultTypeInfo :: Maybe (Maybe String, String) } | GScan { gscanTypeInfo :: Maybe (Maybe String, String) } | Basic { basicStrType :: StrType, basicTypeInfo :: Maybe (Maybe String, String) } | Posn { posnStrType :: StrType, posnTypeInfo :: Maybe (Maybe String, String) } | Monad { monadStrType :: StrType, monadUserState :: Bool, monadTypeInfo :: Maybe (Maybe String, String) } wrapperCppDefs :: Scheme -> Maybe [String] wrapperCppDefs Default {} = Nothing wrapperCppDefs GScan {} = Just ["ALEX_GSCAN"] wrapperCppDefs Basic { basicStrType = Str } = Just ["ALEX_BASIC"] wrapperCppDefs Basic { basicStrType = Lazy } = Just ["ALEX_BASIC_BYTESTRING"] wrapperCppDefs Basic { basicStrType = Strict } = Just ["ALEX_STRICT_BYTESTRING"] wrapperCppDefs Basic { basicStrType = StrictText } = Just ["ALEX_STRICT_TEXT"] wrapperCppDefs Posn { posnStrType = Str } = Just ["ALEX_POSN"] wrapperCppDefs Posn { posnStrType = Lazy } = Just ["ALEX_POSN_BYTESTRING"] wrapperCppDefs Posn { posnStrType = Strict } = Just ["ALEX_POSN_BYTESTRING"] wrapperCppDefs Posn { posnStrType = StrictText } = Just ["ALEX_POSN_STRICT_TEXT"] wrapperCppDefs Monad { monadStrType = Str, monadUserState = False } = Just ["ALEX_MONAD"] wrapperCppDefs Monad { monadStrType = Strict, monadUserState = False } = Just ["ALEX_MONAD_BYTESTRING"] wrapperCppDefs Monad { monadStrType = Lazy, monadUserState = False } = Just ["ALEX_MONAD_BYTESTRING"] wrapperCppDefs Monad { monadStrType = StrictText, monadUserState = False } = Just ["ALEX_MONAD_STRICT_TEXT"] wrapperCppDefs Monad { monadStrType = Str, monadUserState = True } = Just ["ALEX_MONAD", "ALEX_MONAD_USER_STATE"] wrapperCppDefs Monad { monadStrType = Strict, monadUserState = True } = Just ["ALEX_MONAD_BYTESTRING", "ALEX_MONAD_USER_STATE"] wrapperCppDefs Monad { monadStrType = Lazy, monadUserState = True } = Just ["ALEX_MONAD_BYTESTRING", "ALEX_MONAD_USER_STATE"] wrapperCppDefs Monad { monadStrType = StrictText, monadUserState = True } = Just ["ALEX_MONAD_STRICT_TEXT", "ALEX_MONAD_USER_STATE"] -- TODO: update this comment -- -- A `Scanner' consists of an association list associating token names with -- regular expressions with context. The context may include a list of start -- codes, some leading context to test the character immediately preceding the -- token and trailing context to test the residual input after the token. -- -- The start codes consist of the names and numbers of the start codes; -- initially the names only will be generated by the parser, the numbers being -- allocated at a later stage. Start codes become meaningful when scanners are -- converted to DFAs; see the DFA section of the Scan module for details. data Scanner = Scanner { scannerName :: String, scannerTokens :: [RECtx] } deriving Show data RECtx = RECtx { reCtxStartCodes :: [(String,StartCode)], reCtxPreCtx :: Maybe CharSet, reCtxRE :: RExp, reCtxPostCtx :: RightContext RExp, reCtxCode :: Maybe Code } data RightContext r = NoRightContext | RightContextRExp r | RightContextCode Code deriving (Eq,Ord) instance Show RECtx where showsPrec _ (RECtx scs _ r rctx code) = showStarts scs . shows r . showRCtx rctx . showMaybeCode code showMaybeCode :: Maybe String -> String -> String showMaybeCode Nothing = id showMaybeCode (Just code) = showCode code showCode :: String -> String -> String showCode code = showString " { " . showString code . showString " }" showStarts :: [(String, StartCode)] -> String -> String showStarts [] = id showStarts scs = shows scs showRCtx :: Show r => RightContext r -> String -> String showRCtx NoRightContext = id showRCtx (RightContextRExp r) = ('\\':) . shows r showRCtx (RightContextCode code) = showString "\\ " . showCode code -- ----------------------------------------------------------------------------- -- DFAs data DFA s a = DFA { dfa_start_states :: [s], dfa_states :: Map s (State s a) } data State s a = State { state_acc :: [Accept a], state_out :: IntMap s -- 0..255 only } type SNum = Int data Accept a = Acc { accPrio :: Int, accAction :: Maybe a, accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point. accRightCtx :: RightContext SNum } deriving (Eq,Ord) -- debug stuff instance Show (Accept a) where showsPrec _ (Acc p _act _lctx _rctx) = shows p --TODO type StartCode = Int -- ----------------------------------------------------------------------------- -- Predicates / contexts -- we can generate somewhat faster code in the case that -- the lexer doesn't use predicates data UsesPreds = UsesPreds | DoesntUsePreds deriving Eq usesPreds :: DFA s a -> UsesPreds usesPreds dfa | any acceptHasCtx [ acc | st <- Map.elems (dfa_states dfa) , acc <- state_acc st ] = UsesPreds | otherwise = DoesntUsePreds where acceptHasCtx Acc { accLeftCtx = Nothing , accRightCtx = NoRightContext } = False acceptHasCtx _ = True -- ----------------------------------------------------------------------------- -- Regular expressions -- `RExp' provides an abstract syntax for regular expressions. `Eps' will -- match empty strings; `Ch p' matches strings containing a single character -- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of -- its prefixes and `re2' matches the rest; `re1 :|| re2' matches a string if -- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be -- expressed in terms of the other operators. See the definitions of `ARexp' -- for a formal definition of the semantics of these operators. data RExp = Eps -- ^ Empty. | Ch CharSet -- ^ Singleton. | RExp :%% RExp -- ^ Sequence. | RExp :|| RExp -- ^ Alternative. | Star RExp -- ^ Zero or more repetitions. | Plus RExp -- ^ One or more repetitions. | Ques RExp -- ^ Zero or one repetitions. instance Show RExp where showsPrec _ Eps = showString "()" showsPrec _ (Ch _) = showString "[..]" showsPrec _ (l :%% r) = shows l . shows r showsPrec _ (l :|| r) = shows l . ('|':) . shows r showsPrec _ (Star r) = shows r . ('*':) showsPrec _ (Plus r) = shows r . ('+':) showsPrec _ (Ques r) = shows r . ('?':) -- | A regular expression is nullable if it matches the empty string. nullable :: RExp -> Bool nullable Eps = True nullable Ch{} = False nullable (l :%% r) = nullable l && nullable r nullable (l :|| r) = nullable l || nullable r nullable Star{} = True nullable (Plus r) = nullable r nullable Ques{} = True {------------------------------------------------------------------------------ Abstract Regular Expression ------------------------------------------------------------------------------} -- This section contains demonstrations; it is not part of Alex. {- -- This function illustrates `ARexp'. It returns true if the string in its -- argument is matched by the regular expression. recognise:: RExp -> String -> Bool recognise re inp = any (==len) (ap_ar (arexp re) inp) where len = length inp -- `ARexp' provides an regular expressions in abstract format. Here regular -- expressions are represented by a function that takes the string to be -- matched and returns the sizes of all the prefixes matched by the regular -- expression (the list may contain duplicates). Each of the `RExp' operators -- are represented by similarly named functions over ARexp. The `ap' function -- takes an `ARExp', a string and returns the sizes of all the prefixes -- matching that regular expression. `arexp' converts an `RExp' to an `ARexp'. arexp:: RExp -> ARexp arexp Eps = eps_ar arexp (Ch p) = ch_ar p arexp (re :%% re') = arexp re `seq_ar` arexp re' arexp (re :|| re') = arexp re `bar_ar` arexp re' arexp (Star re) = star_ar (arexp re) arexp (Plus re) = plus_ar (arexp re) arexp (Ques re) = ques_ar (arexp re) star_ar:: ARexp -> ARexp star_ar sc = eps_ar `bar_ar` plus_ar sc plus_ar:: ARexp -> ARexp plus_ar sc = sc `seq_ar` star_ar sc ques_ar:: ARexp -> ARexp ques_ar sc = eps_ar `bar_ar` sc -- Hugs abstract type definition -- not for GHC. type ARexp = String -> [Int] -- in ap_ar, eps_ar, ch_ar, seq_ar, bar_ar ap_ar:: ARexp -> String -> [Int] ap_ar sc = sc eps_ar:: ARexp eps_ar inp = [0] ch_ar:: (Char->Bool) -> ARexp ch_ar p "" = [] ch_ar p (c:rst) = if p c then [1] else [] seq_ar:: ARexp -> ARexp -> ARexp seq_ar sc sc' inp = [n+m| n<-sc inp, m<-sc' (drop n inp)] bar_ar:: ARexp -> ARexp -> ARexp bar_ar sc sc' inp = sc inp ++ sc' inp -} -- ----------------------------------------------------------------------------- -- Utils -- Map the available start codes onto [1..] encodeStartCodes:: Scanner -> (Scanner,[StartCode],ShowS) encodeStartCodes scan = (scan', 0 : map snd name_code_pairs, sc_hdr) where scan' = scan{ scannerTokens = map mk_re_ctx (scannerTokens scan) } mk_re_ctx (RECtx scs lc re rc code) = RECtx (map mk_sc scs) lc re rc code mk_sc (nm,_) = (nm, if nm=="0" then 0 else fromJust (Map.lookup nm code_map)) sc_hdr tl = case name_code_pairs of [] -> tl (nm,_):rst -> "\n" ++ nm ++ foldr f t rst where f (nm', _) t' = "," ++ nm' ++ t' t = " :: Int\n" ++ foldr fmt_sc tl name_code_pairs where fmt_sc (nm,sc) t = nm ++ " = " ++ show sc ++ "\n" ++ t code_map = Map.fromList name_code_pairs name_code_pairs = zip nms [1..] nms = Set.toAscList . Set.fromList $ [ nm | RECtx{ reCtxStartCodes = scs } <- scannerTokens scan , (nm, _) <- scs , nm /= "0" ] -- Grab the code fragments for the token actions, and replace them -- with function names of the form alex_action_$n$. We do this -- because the actual action fragments might be duplicated in the -- generated file. extractActions :: Scheme -> Scanner -> (Scanner,ShowS) extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str . nl) where (new_tokens, decls) = unzip (zipWith f (scannerTokens scanner) act_names) f r@(RECtx{ reCtxCode = Just code }) name = (r{reCtxCode = Just name}, Just (mkDecl name code)) f r@(RECtx{ reCtxCode = Nothing }) _ = (r{reCtxCode = Nothing}, Nothing) gscanActionType res = str "AlexPosn -> Char -> String -> Int -> ((Int, state) -> " . str res . str ") -> (Int, state) -> " . str res mkDecl fun code = mkTySig fun . mkDef fun code mkDef fun code = str fun . str " = " . str code . nl mkTySig fun = case scheme of Default { defaultTypeInfo = Just (Nothing, actionty) } -> nl . str fun . str " :: " . str actionty . nl Default { defaultTypeInfo = Just (Just tyclasses, actionty) } -> nl . str fun . str " :: (" . str tyclasses . str ") => " . str actionty . nl GScan { gscanTypeInfo = Just (Nothing, tokenty) } -> nl . str fun . str " :: " . gscanActionType tokenty . nl GScan { gscanTypeInfo = Just (Just tyclasses, tokenty) } -> nl . str fun . str " :: (" . str tyclasses . str ") => " . gscanActionType tokenty . nl Basic { basicStrType = strty, basicTypeInfo = Just (Nothing, tokenty) } -> nl . str fun . str " :: " . str (show strty) . str " -> " . str tokenty . nl Basic { basicStrType = strty, basicTypeInfo = Just (Just tyclasses, tokenty) } -> nl . str fun . str " :: (" . str tyclasses . str ") => " . str (show strty) . str " -> " . str tokenty . nl Posn { posnStrType = strty, posnTypeInfo = Just (Nothing, tokenty) } -> nl . str fun . str " :: AlexPosn -> " . str (show strty) . str " -> " . str tokenty . nl Posn { posnStrType = strty, posnTypeInfo = Just (Just tyclasses, tokenty) } -> nl . str fun . str " :: (" . str tyclasses . str ") => AlexPosn -> " . str (show strty) . str " -> " . str tokenty . nl Monad { monadStrType = strty, monadTypeInfo = Just (Nothing, tokenty) } -> nl . let actintty = if strty == Lazy then "Int64" else "Int" in str fun . str " :: AlexInput -> " . str actintty . str " -> Alex (" . str tokenty . str ")" . nl Monad { monadStrType = strty, monadTypeInfo = Just (Just tyclasses, tokenty) } -> nl . let actintty = if strty == Lazy then "Int64" else "Int" in str fun . str " :: (" . str tyclasses . str ") =>" . str " AlexInput -> " . str actintty . str " -> Alex (" . str tokenty . str ")" . nl _ -> id act_names = map (\n -> "alex_action_" ++ show (n::Int)) [0..] decl_str :: ShowS decl_str = foldr (.) id [ decl | Just decl <- decls ] -- ----------------------------------------------------------------------------- -- Code generation targets data Target = GhcTarget | HaskellTarget deriving Eq