{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module Regex.KDE.Compile
(compileRegex)
where
import Data.Word (Word8)
import qualified Data.ByteString as B
import Data.ByteString (ByteString)
import qualified Data.ByteString.UTF8 as U
import Safe
import Data.Attoparsec.ByteString as A hiding (match)
import Data.Char
import Control.Applicative
import Regex.KDE.Regex
import Control.Monad.State.Strict
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup ((<>))
#endif
compileRegex :: Bool -> ByteString -> Either String Regex
compileRegex caseSensitive bs =
let !res = parseOnly (evalStateT parser 0) bs
in res
where
parser = do
!re <- pRegex caseSensitive
(re <$ lift A.endOfInput) <|>
do rest <- lift A.takeByteString
fail $ "parse error at byte position " ++
show (B.length bs - B.length rest)
type RParser = StateT Int Parser
pRegex :: Bool -> RParser Regex
pRegex caseSensitive =
option MatchNull $
foldr MatchAlt
<$> (pAltPart caseSensitive)
<*> (many $ lift (char '|') *> (pAltPart caseSensitive <|> pure mempty))
pAltPart :: Bool -> RParser Regex
pAltPart caseSensitive = mconcat <$> many1 (pRegexPart caseSensitive)
char :: Char -> Parser Char
char c =
c <$ satisfy (== fromIntegral (ord c))
pRegexPart :: Bool -> RParser Regex
pRegexPart caseSensitive =
(lift (pRegexChar caseSensitive) <|> pParenthesized caseSensitive) >>=
lift . pSuffix
pParenthesized :: Bool -> RParser Regex
pParenthesized caseSensitive = (do
_ <- lift (satisfy (== 40))
modifier <- lift (satisfy (== 63) *> pGroupModifiers)
<|> (MatchCapture <$> (modify (+ 1) *> get))
contents <- pRegex caseSensitive
_ <- lift (satisfy (== 41))
return $ modifier contents)
<|> Recurse <$ (lift (string "(?R)" <|> string "(?0)"))
pGroupModifiers :: Parser (Regex -> Regex)
pGroupModifiers =
(id <$ char ':')
<|>
do dir <- option Forward $ Backward <$ char '<'
(AssertPositive dir <$ char '=') <|> (AssertNegative dir <$ char '!')
pSuffix :: Regex -> Parser Regex
pSuffix re = option re $ do
w <- satisfy (\x -> x == 42 || x == 43 || x == 63 || x == 123)
(case w of
42 -> return $ MatchAlt (MatchSome re) MatchNull
43 -> return $ MatchSome re
63 -> return $ MatchAlt re MatchNull
123 -> do
let isDig x = x >= 48 && x < 58
minn <- option Nothing $ readMay . U.toString <$> A.takeWhile isDig
maxn <- option Nothing $ char ',' *>
(readMay . U.toString <$> A.takeWhile isDig)
_ <- char '}'
return $!
case (minn, maxn) of
(Nothing, Nothing) -> atleast 0 re
(Just n, Nothing) -> atleast n re
(Nothing, Just n) -> atmost n re
(Just m, Just n) -> between m n re
_ -> fail "pSuffix encountered impossible byte") >>= pQuantifierModifier
where
atmost 0 _ = MatchNull
atmost n r = MatchAlt (mconcat (replicate n r)) (atmost (n-1) r)
between 0 n r = atmost n r
between m n r = mconcat (replicate m r) <> atmost (n - m) r
atleast n r = mconcat (replicate n r) <> MatchAlt (MatchSome r) MatchNull
pQuantifierModifier :: Regex -> Parser Regex
pQuantifierModifier re = option re $
(Possessive re <$ satisfy (== 43)) <|>
(Lazy re <$ satisfy (==63))
pRegexChar :: Bool -> Parser Regex
pRegexChar caseSensitive = do
w <- satisfy $ const True
case w of
46 -> return MatchAnyChar
37 -> (do
ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57)
case readMay (U.toString ds) of
Just !n -> return $ MatchDynamic n
Nothing -> fail "not a number")
<|> return (MatchChar (== '%'))
92 -> pRegexEscapedChar
36 -> return AssertEnd
94 -> return AssertBeginning
91 -> pRegexCharClass
_ | w < 128
, not (isSpecial w)
-> do let c = chr $ fromIntegral w
return $! MatchChar $
if caseSensitive
then (== c)
else (\d -> toLower d == toLower c)
| w >= 0xc0 -> do
rest <- case w of
_ | w >= 0xf0 -> A.take 3
| w >= 0xe0 -> A.take 2
| otherwise -> A.take 1
case U.uncons (B.cons w rest) of
Just (d, _) -> return $! MatchChar $
if caseSensitive
then (== d)
else (\e -> toLower e == toLower d)
Nothing -> fail "could not decode as UTF8"
| otherwise -> mzero
pRegexEscapedChar :: Parser Regex
pRegexEscapedChar = do
c <- anyChar
(case c of
'b' -> return AssertWordBoundary
'{' -> do
ds <- A.takeWhile1 (\x -> x >= 48 && x <= 57)
_ <- char '}'
case readMay (U.toString ds) of
Just !n -> return $ MatchCaptured $ n
Nothing -> fail "not a number"
'd' -> return $ MatchChar isDigit
'D' -> return $ MatchChar (not . isDigit)
's' -> return $ MatchChar isSpace
'S' -> return $ MatchChar (not . isSpace)
'w' -> return $ MatchChar isWordChar
'W' -> return $ MatchChar (not . isWordChar)
_ | c >= '0' && c <= '9' ->
return $! MatchCaptured (ord c - ord '0')
| otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c)
pEscaped :: Char -> Parser Char
pEscaped c =
case c of
'\\' -> return c
'a' -> return '\a'
'f' -> return '\f'
'n' -> return '\n'
'r' -> return '\r'
't' -> return '\t'
'v' -> return '\v'
'0' -> do
ds <- A.take 3
case readMay ("'\\o" ++ U.toString ds ++ "'") of
Just x -> return x
Nothing -> fail "invalid octal character escape"
'z' -> do
ds <- A.take 4
case readMay ("'\\x" ++ U.toString ds ++ "'") of
Just x -> return x
Nothing -> fail "invalid hex character escape"
_ | c >= '1' && c <= '7' -> do
ds <- A.take 2
case readMay ("'\\o" ++ c : U.toString ds ++ "'") of
Just x -> return x
Nothing -> fail "invalid octal character escape"
| otherwise -> return c
pRegexCharClass :: Parser Regex
pRegexCharClass = do
negated <- option False $ True <$ satisfy (== 94)
let getEscapedClass = do
_ <- satisfy (== 92)
(isDigit <$ char 'd')
<|> (not . isDigit <$ char 'D')
<|> (isSpace <$ char 's')
<|> (not . isSpace <$ char 'S')
<|> (isWordChar <$ char 'w')
<|> (not . isWordChar <$ char 'W')
let getPosixClass = do
_ <- string "[:"
localNegated <- option False $ True <$ satisfy (== 94)
res <- (isAlphaNum <$ string "alnum")
<|> (isAlpha <$ string "alpha")
<|> (isAscii <$ string "ascii")
<|> ((\c -> isSpace c && c `notElem` ['\n','\r','\f','\v']) <$
string "blank")
<|> (isControl <$ string "cntrl")
<|> ((\c -> isPrint c || isSpace c) <$ string "graph:")
<|> (isLower <$ string "lower")
<|> (isUpper <$ string "upper")
<|> (isPrint <$ string "print")
<|> (isPunctuation <$ string "punct")
<|> (isSpace <$ string "space")
<|> ((\c -> isAlphaNum c ||
generalCategory c == ConnectorPunctuation)
<$ string "word:")
<|> (isHexDigit <$ string "xdigit")
_ <- string ":]"
return $! if localNegated then not . res else res
let getC = (satisfy (== 92) *> anyChar >>= pEscaped) <|>
(chr . fromIntegral <$> satisfy (\x -> x /= 92 && x /= 93))
let getCRange = do
c <- getC
(\d -> (\x -> x >= c && x <= d)) <$> (char '-' *> getC) <|>
return (== c)
brack <- option [] $ [(==']')] <$ char ']'
fs <- many (getEscapedClass <|> getPosixClass <|> getCRange)
_ <- satisfy (== 93)
let f c = any ($ c) $ brack ++ fs
return $! MatchChar (if negated then (not . f) else f)
anyChar :: Parser Char
anyChar = do
w <- satisfy (const True)
return $! chr $ fromIntegral w
isSpecial :: Word8 -> Bool
isSpecial 92 = True
isSpecial 63 = True
isSpecial 42 = True
isSpecial 43 = True
isSpecial 123 = True
isSpecial 91 = True
isSpecial 93 = True
isSpecial 37 = True
isSpecial 40 = True
isSpecial 41 = True
isSpecial 124 = True
isSpecial 46 = True
isSpecial 36 = True
isSpecial 94 = True
isSpecial _ = False