{-# LANGUAGE BangPatterns #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} module Regex.Internal.Parser ( Parser(..) , Node(..) , compile , compileBounded , ParserState , prepareParser , stepParser , finishParser , Foldr , parseFoldr ) where import Control.Applicative import Control.Monad.Trans.State.Strict import Control.Monad.Fix import Data.Maybe (isJust) import Data.Primitive.SmallArray import qualified Data.Foldable as F import Regex.Internal.Regex (RE(..), Strictness(..), Greediness(..)) import Regex.Internal.Unique (Unique(..), UniqueSet) import qualified Regex.Internal.Unique as U ---------- -- Types ---------- -- | A parser compiled from a @'RE' c a@. data Parser c a where PToken :: !(c -> Maybe a) -> Parser c a PFmap :: !Strictness -> !(a1 -> a) -> !(Parser c a1) -> Parser c a PFmap_ :: !(Node c a) -> Parser c a PPure :: a -> Parser c a PLiftA2 :: !Strictness -> !(a1 -> a2 -> a) -> !(Parser c a1) -> !(Parser c a2) -> Parser c a PEmpty :: Parser c a PAlt :: !Unique -> !(Parser c a) -> !(Parser c a) -> !(SmallArray (Parser c a)) -> Parser c a PFoldGr :: !Unique -> !Strictness -> !(a -> a1 -> a) -> a -> !(Parser c a1) -> Parser c a PFoldMn :: !Unique -> !Strictness -> !(a -> a1 -> a) -> a -> !(Parser c a1) -> Parser c a PMany :: !Unique -> !(a1 -> a) -> !(a2 -> a) -> !(a2 -> a1 -> a2) -> !a2 -> !(Parser c a1) -> Parser c a -- | A node in the NFA. Used for recognition. data Node c a where NAccept :: a -> Node c a NGuard :: !Unique -> Node c a -> Node c a NToken :: !(c -> Maybe a1) -> !(Node c a) -> Node c a NEmpty :: Node c a NAlt :: !(Node c a) -> !(Node c a) -> !(SmallArray (Node c a)) -> Node c a -- Note that NGuard is lazy in the node. We have to introduce laziness in -- at least one place, to make a graph with loops possible. ------------ -- Compile ------------ -- | \(O(m)\). Compile a @RE c a@ to a @Parser c a@. -- -- Note: @compile@ does not limit the size of the @RE@. See 'compileBounded' -- if you would like to limit the size. -- @RE@s with size greater than @(maxBound::Int) \`div\` 2@ are not supported -- and the behavior of such a @RE@ is undefined. compile :: RE c a -> Parser c a compile re = evalState (compileToParser re) (Unique 0) nxtU :: State Unique Unique nxtU = state $ \u -> let !u' = Unique (unUnique u + 1) in (u, u') compileToParser :: RE c a -> State Unique (Parser c a) compileToParser re = case re of RToken t -> pure $ PToken t RFmap st f re1 -> PFmap st f <$> compileToParser re1 RFmap_ a re1 -> PFmap_ <$> compileToNode a re1 RPure a -> pure $ PPure a RLiftA2 st f re1 re2 -> liftA2 (PLiftA2 st f) (compileToParser re1) (compileToParser re2) REmpty -> pure PEmpty RAlt re01 re02 -> do u <- nxtU let (re1,re2,res) = gatherAlts re01 re02 p1 <- compileToParser re1 p2 <- compileToParser re2 ps <- traverse compileToParser res pure $ PAlt u p1 p2 (smallArrayFromList ps) RFold st gr f z re1 -> do u <- nxtU _localU <- nxtU case gr of Greedy -> PFoldGr u st f z <$> compileToParser re1 Minimal -> PFoldMn u st f z <$> compileToParser re1 RMany f1 f2 f z re1 -> do u <- nxtU _localU <- nxtU PMany u f1 f2 f z <$> compileToParser re1 compileToNode :: forall c a a1. a -> RE c a1 -> State Unique (Node c a) compileToNode a re0 = go re0 (NAccept a) where go :: forall a2. RE c a2 -> Node c a -> State Unique (Node c a) go re nxt = case re of RToken t -> pure $ NToken t nxt RFmap _ _ re1 -> go re1 nxt RFmap_ _ re1 -> go re1 nxt RPure _ -> pure nxt RLiftA2 _ _ re1 re2 -> go re2 nxt >>= go re1 REmpty -> pure NEmpty RAlt re01 re02 -> do u <- nxtU let nxt1 = NGuard u nxt (re1,re2,res) = gatherAlts re01 re02 n1 <- go re1 nxt1 n2 <- go re2 nxt1 ns <- traverse (flip go nxt1) res pure $ NAlt n1 n2 (smallArrayFromList ns) RFold _ gr _ _ re1 -> goMany gr re1 nxt RMany _ _ _ _ re1 -> goMany Greedy re1 nxt goMany :: forall a2. Greediness -> RE c a2 -> Node c a -> State Unique (Node c a) goMany gr re1 nxt = do u <- nxtU mfix $ \n -> do ndown <- go re1 n case gr of Greedy -> pure $ NGuard u (NAlt ndown nxt emptySmallArray) Minimal -> pure $ NGuard u (NAlt nxt ndown emptySmallArray) gatherAlts :: RE c a -> RE c a -> (RE c a, RE c a, [RE c a]) gatherAlts re01 re02 = case go re01 (go re02 []) of re11:re12:res -> (re11, re12, res) _ -> errorWithoutStackTrace "Regex.Internal.Parser.gatherAlts: impossible" where go (RAlt re1 re2) = go re1 . go re2 go re = (re:) -------------------- -- Compile bounded -------------------- -- | \(O(\min(l,m))\). Compile a @RE c a@ to a @Parser c a@. -- -- Returns @Nothing@ if the size of the @RE@ is greater than the provided limit -- \(l\). You may want to use this if you suspect that the @RE@ may be too -- large, for instance if the regex is constructed from an untrusted source. -- -- While the exact size of a @RE@ depends on an internal representation, it can -- be assumed to be in the same order as the length of a -- [regex pattern](https://en.wikipedia.org/wiki/Regular_expression#Syntax) -- corresponding to the @RE@. compileBounded :: Int -> RE c a -> Maybe (Parser c a) compileBounded lim re = if checkSize lim re then Just $! compile re else Nothing checkSize :: Int -> RE c a -> Bool checkSize lim re0 = isJust (evalStateT (go re0) 0) where go :: RE c a1 -> StateT Int Maybe () go re = case re of RToken _ -> inc RFmap _ _ re1 -> inc *> go re1 RFmap_ _ re1 -> inc *> go re1 RPure _ -> inc RLiftA2 _ _ re1 re2 -> inc *> go re1 *> go re2 REmpty -> inc RAlt re1 re2 -> inc *> go re1 *> go re2 RMany _ _ _ _ re1 -> inc *> go re1 RFold _ _ _ _ re1 -> inc *> go re1 inc = do n <- get if n == lim then empty else put $! n+1 ---------- -- Parse ---------- data Cont c b a where CTop :: Cont c a a CFmap :: !Strictness -> !(b -> a1) -> !(Cont c a1 a) -> Cont c b a CFmap_ :: !(Node c a1) -> !(Cont c a1 a) -> Cont c b a CLiftA2A :: !Strictness -> !(b -> a2 -> a3) -> !(Parser c a2) -> !(Cont c a3 a) -> Cont c b a CLiftA2B :: !Strictness -> !(a1 -> b -> a3) -> a1 -> !(Cont c a3 a) -> Cont c b a CAlt :: !Unique -> !(Cont c b a) -> Cont c b a CFoldGr :: !Unique -> !Strictness -> !(Parser c b) -> !(a1 -> b -> a1) -> a1 -> !(Cont c a1 a) -> Cont c b a CFoldMn :: !Unique -> !Strictness -> !(Parser c b) -> !(a1 -> b -> a1) -> a1 -> !(Cont c a1 a) -> Cont c b a CMany :: !Unique -> !(Parser c b) -> !(b -> a2) -> !(a1 -> a2) -> !(a1 -> b -> a1) -> !a1 -> !(Cont c a2 a) -> Cont c b a data NeedCList c a where NeedCCons :: !(c -> Maybe b) -> !(Cont c b a) -> !(NeedCList c a) -> NeedCList c a NeedCNil :: NeedCList c a data StepState c a = StepState { sSet :: {-# UNPACK #-} !UniqueSet , sNeed :: !(NeedCList c a) , sResult :: !(Maybe a) } stepStateZero :: StepState c a stepStateZero = StepState U.empty NeedCNil Nothing -- Note: Ideally we would have -- down :: Parser c b -> Cont c b a -> State (StepState c a) () -- and similar downNode and up, but GHC is unable to optimize it to be -- equivalent to the current code. -- -- Using State is pretty convenient though, so it is used in branches. This -- seems to get optimized well enough. sMember :: Unique -> State (StepState c a) Bool sMember u = gets $ \pt -> U.member u (sSet pt) sInsert :: Unique -> State (StepState c a) () sInsert u = modify' $ \pt -> pt { sSet = U.insert u (sSet pt) } down :: Parser c b -> Cont c b a -> StepState c a -> StepState c a down p !ct !pt = case p of PToken t -> pt { sNeed = NeedCCons t ct (sNeed pt) } PFmap st f p1 -> down p1 (CFmap st f ct) pt PFmap_ n -> downNode n ct pt PPure b -> up b ct pt PLiftA2 st f p1 p2 -> down p1 (CLiftA2A st f p2 ct) pt PEmpty -> pt PAlt u p1 p2 ps -> let ct1 = CAlt u ct in F.foldl' (\pt' p' -> down p' ct1 pt') (down p2 ct1 (down p1 ct1 pt)) ps PFoldGr u st f z p1 -> flip execState pt $ unlessM (sMember u) $ do sInsert (localU u) modify' $ down p1 (CFoldGr u st p1 f z ct) unlessM (sMember u) $ do sInsert u modify' $ up z ct PFoldMn u st f z p1 -> flip execState pt $ unlessM (sMember u) $ do unlessM (sMember (localU u)) $ do modify' $ up z ct sInsert u modify' $ down p1 (CFoldMn u st p1 f z ct) PMany u f1 f2 f z p1 -> flip execState pt $ unlessM (sMember u) $ do sInsert (localU u) modify' $ down p1 (CMany u p1 f1 f2 f z ct) unlessM (sMember u) $ do sInsert u let !x = f2 z modify' $ up x ct downNode :: Node c b -> Cont c b a -> StepState c a -> StepState c a downNode n0 !ct = go n0 where go n !pt = case n of NAccept b -> up b ct pt NGuard u n1 | U.member u (sSet pt) -> pt | otherwise -> go n1 (pt { sSet = U.insert u (sSet pt) }) NToken t nxt -> pt { sNeed = NeedCCons t (CFmap_ nxt ct) (sNeed pt) } NEmpty -> pt NAlt n1 n2 ns -> F.foldl' (flip go) (go n2 (go n1 pt)) ns up :: b -> Cont c b a -> StepState c a -> StepState c a up b ct !pt = case ct of CTop -> pt { sResult = sResult pt <|> Just b } CFmap st f ct1 -> case st of Strict -> let !x = f b in up x ct1 pt NonStrict -> up (f b) ct1 pt CFmap_ n ct1 -> downNode n ct1 pt CLiftA2A st f p1 ct1 -> down p1 (CLiftA2B st f b ct1) pt CLiftA2B st f a ct1 -> case st of Strict -> let !x = f a b in up x ct1 pt NonStrict -> up (f a b) ct1 pt CAlt u ct1 -> flip execState pt $ unlessM (sMember u) $ do sInsert u modify' $ up b ct1 CFoldGr u st p1 f z ct1 -> flip execState pt $ unlessM (sMember u) $ do lc <- sMember (localU u) if lc then do sInsert u modify' $ up z ct1 else do let go z1 = do modify' $ down p1 (CFoldGr u st p1 f z1 ct1) sInsert u modify' $ up z1 ct1 {-# INLINE go #-} case st of Strict -> let !z1 = f z b in go z1 NonStrict -> go (f z b) CFoldMn u st p1 f z ct1 -> flip execState pt $ unlessM (sMember u) $ do let go z1 = do sInsert (localU u) modify' $ up z1 ct1 unlessM (sMember u) $ do sInsert u modify' $ down p1 (CFoldMn u st p1 f z1 ct1) {-# INLINE go #-} case st of Strict -> let !z1 = f z b in go z1 NonStrict -> go (f z b) CMany u p1 f1 f2 f z ct1 -> flip execState pt $ unlessM (sMember u) $ do lc <- sMember (localU u) if lc then do sInsert u let !x = f1 b modify' $ up x ct1 else do let !z1 = f z b modify' $ down p1 (CMany u p1 f1 f2 f z1 ct1) sInsert u let !x = f2 z1 modify' $ up x ct1 localU :: Unique -> Unique localU = Unique . (+1) . unUnique -------------------- -- Running a Parser -------------------- -- | The state maintained for parsing. data ParserState c a = ParserState { psNeed :: !(NeedCList c a) , psResult :: !(Maybe a) } -- | \(O(m \log m)\). Prepare a parser for input. prepareParser :: Parser c a -> ParserState c a prepareParser p = toParserState (down p CTop stepStateZero) -- | \(O(m \log m)\). Step a parser by feeding a single element @c@. Returns -- @Nothing@ if the parse has failed regardless of further input. Otherwise, -- returns an updated @ParserState@. stepParser :: ParserState c a -> c -> Maybe (ParserState c a) stepParser ps c = case psNeed ps of NeedCNil -> Nothing needs -> Just $! toParserState (go needs) where go (NeedCCons t ct rest) = let !pt = go rest in maybe pt (\b -> up b ct pt) (t c) go NeedCNil = stepStateZero {-# INLINE stepParser #-} -- | \(O(1)\). Get the parse result for the input fed into the parser so far. finishParser :: ParserState c a -> Maybe a finishParser = psResult toParserState :: StepState c a -> ParserState c a toParserState pt = ParserState { psNeed = sNeed pt , psResult = sResult pt } -- | A fold function. type Foldr f a = forall b. (a -> b -> b) -> b -> f -> b -- | \(O(mn \log m)\). Run a parser given a sequence @f@ and a fold of @f@. parseFoldr :: Foldr f c -> Parser c a -> f -> Maybe a parseFoldr fr = \p xs -> fr f finishParser xs (prepareParser p) where f c k = \ps -> stepParser ps c >>= k {-# INLINE parseFoldr #-} --------- -- Util --------- unlessM :: Monad m => m Bool -> m () -> m () unlessM mb mx = do b <- mb if b then pure () else mx ---------- -- Notes ---------- -- Note [About the algorithm] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- To parse using a regex, we compile the regex into a non-deterministic finite -- automata (NFA). Actually, we only only do this for recognition, i.e. checking -- whether a sequence satisfies a regex. This is done if the regex is a RFmap_. -- -- To parse into a value, we have to do more work. We keep the regex as a tree -- (Parser), but we preserve the path taken down the tree, like a zipper. -- This lets us go up the tree and continue parsing, once we parse a c. -- If you squint your eyes, this is also an NFA, only each edge of the NFA is -- broken into multiple steps up and down the tree. -- -- Recognition using the NFA is faster than parsing, unsurprisingly. -- A Parser tree can have NFAs as children. This means that if some subtree of -- the regex only attempts to recognize some input, it doesn't pay the extra -- cost of parsing. -- -- Key objective: O(mn log m) time parsing. This means that for every c fed into -- the parser, we are allowed to take no more than O(m log m) time. -- -- How is this ensured? -- 1. The compiled regex must have O(m) nodes and O(m) edges. The Parser tree -- satisfies this, of course, since it reflects the regex itself. The NFA -- also satisfies this, implemented as Thompson's construction: -- https://en.wikipedia.org/wiki/Thompson%27s_construction -- 2. For every c, no edge is traversed twice. Tree edges are bidirectional -- unlike NFA edges, so an NFA edge may be traversed only once and a tree -- edge may be traversed once in each direction. -- -- NFA guards: To ensure each NFA edge can be traversed only once, guard nodes -- (NGuard) carry a Unique which can be stored in a set (sSet). Guard nodes are -- created during compilation whenever two nodes would lead into one node: -- A->C, B->C. A guard node is added, such that it becomes A->G, B->G, G->C. -- -- Parser guards: Parser guards are more tricky. -- Alt: There are two ways into an Alt node when going up. So, an Alt node -- carries a Unique is stored in sSet and guards upward travel through the -- node. -- FoldGr: There are two ways into a FoldGr node, one going down and one going -- up. But, we can't just a use a Unique to guard entry into it because we -- want to handle loopy cases correctly! A loopy case is where we reach the -- same node in the tree by going up and down the edges without consuming -- input. To detect this, we use a separate Unique (localU) when going down. -- If we find it set when going up, we are looping. When we send up a value, -- looping or not, we guard entry into the node using its (not localU) Unique. -- Many: A Many node is just like FoldlGr, only the looping case is handled -- specially. -- FoldMn: Like FoldGr, there are two ways into a FoldlMn node, one going down -- and one going up, and we must handle loopy cases correctly. A FoldMn sends -- a value up before going down. So, the localU is set when going up and if -- we find it when going down, we are looping. When we send down a value, we -- guard entry into the node using its (not localU) Unique. -- Note [Regex optimizations] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- Currently, the only optimization performed is -- -- * Gather multiple RAlts into a single multi-way branching PAlt/NAlt. It's -- better to multi-way branch at a flat array instead of nested 2-way -- branches, much less pointer-chasing. -- -- Other possible optimizations are possible when compiling, such as removing -- paths going to REmpty. Or even at the RE level by applying laws, such as -- liftA2 f REmpty x = REmpty or liftA2 f (RPure x) y = RFmap (f x) y. -- I don't know yet if this is worth doing.