{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} module Regex.KDE.Compile (compileRegex) where import qualified Data.ByteString as B import qualified Data.Text as T import Data.ByteString (ByteString) import Data.Text.Encoding (decodeUtf8With, encodeUtf8) import Data.Text.Encoding.Error (lenientDecode) import Safe import Data.Attoparsec.Text as A hiding (match) import Data.Char import Control.Applicative import Regex.KDE.Regex import Control.Monad import Control.Monad.State.Strict #if !MIN_VERSION_base(4,11,0) import Data.Semigroup ((<>)) #endif -- I believe the Regex engine used in KatePart is Qt's. -- It is described here: https://doc.qt.io/qt-6/qregexp.html -- | Compile a UTF-8 encoded ByteString as a Regex. If the first -- parameter is True, then the Regex will be case sensitive. compileRegex :: Bool -> ByteString -> Either String Regex compileRegex caseSensitive bs = let !res = parseOnly (evalStateT parser RState{ rsCurrentCaptureNumber = 0, rsCaseSensitive = caseSensitive }) (decodeUtf8With lenientDecode bs) in res where parser = do !re <- pRegex (re <$ lift A.endOfInput) <|> do rest <- lift A.takeText fail $ "parse error at byte position " ++ show (B.length bs - B.length (encodeUtf8 rest)) data RState = RState { rsCurrentCaptureNumber :: Int , rsCaseSensitive :: Bool } deriving (Show) type RParser = StateT RState Parser pRegex :: RParser Regex pRegex = option MatchNull $ foldr MatchAlt <$> pAltPart <*> many (lift (char '|') *> (pAltPart <|> pure mempty)) pAltPart :: RParser Regex pAltPart = mconcat <$> many1 pRegexPart pRegexPart :: RParser Regex pRegexPart = pRegexChar <|> pParenthesized >>= pSuffix pParenthesized :: RParser Regex pParenthesized = do _ <- lift (char '(') -- pcrepattern says: A group that starts with (?| resets the capturing -- parentheses numbers in each alternative. resetCaptureNumbers <- option False (True <$ lift (string "?|")) (modifier, stModifier) <- if resetCaptureNumbers then return (id, id) else lift (char '?' *> pGroupModifiers) <|> do modify (\st -> st{ rsCurrentCaptureNumber = rsCurrentCaptureNumber st + 1}) num <- gets rsCurrentCaptureNumber pure (MatchCapture num, id) currentCaptureNumber <- gets rsCurrentCaptureNumber contents <- option MatchNull $ withStateT stModifier $ foldr MatchAlt <$> pAltPart <*> many (lift (char '|') *> ((when resetCaptureNumbers (modify (\st -> st{ rsCurrentCaptureNumber = currentCaptureNumber })) >> pAltPart) <|> pure mempty)) _ <- lift (char ')') return $ modifier contents pGroupModifiers :: Parser (Regex -> Regex, RState -> RState) pGroupModifiers = (do stmod <- pRegexModifier -- (?i: void (char ':') pure (id, stmod)) <|> do dir <- option Forward $ Backward <$ char '<' ((AssertPositive dir, id) <$ char '=') <|> ((AssertNegative dir, id) <$ char '!') <|> do c <- digit return (\_ -> Subroutine (ord c - 48), id) <|> do void $ char 'R' return (\_ -> Subroutine 0, id) pRegexModifier :: Parser (RState -> RState) pRegexModifier = do -- "adlupimnsx-imnsx" -- i = 105 - = 45 ons <- many $ satisfy (inClass "adlupimnsx") offs <- option [] $ char '-' *> many (satisfy (inClass "imnsx")) pure $ \st -> st{ rsCaseSensitive = if 'i' `elem` ons && 'i' `notElem` offs then False else ('i' `elem` offs) || rsCaseSensitive st } pSuffix :: Regex -> RParser Regex pSuffix re = option re $ do w <- lift $ satisfy (inClass "*+?{") (case w of '*' -> return $ MatchAlt (MatchSome re) MatchNull '+' -> return $ MatchSome re '?' -> return $ MatchAlt re MatchNull '{' -> do minn <- lift $ option Nothing $ readMay . T.unpack <$> A.takeWhile isDigit maxn <- lift $ option minn $ char ',' *> (readMay . T.unpack <$> A.takeWhile isDigit) _ <- lift $ char '}' case (minn, maxn) of (Nothing, Nothing) -> mzero (Just n, Nothing) -> return $! atleast n re (Nothing, Just n) -> return $! atmost n re (Just m, Just n) -> return $! between m n re _ -> fail "pSuffix encountered impossible byte") >>= lift . pQuantifierModifier where atmost 0 _ = MatchNull atmost n r = MatchAlt (mconcat (replicate n r)) (atmost (n-1) r) between 0 n r = atmost n r between m n r = mconcat (replicate m r) <> atmost (n - m) r atleast n r = mconcat (replicate n r) <> MatchAlt (MatchSome r) MatchNull pQuantifierModifier :: Regex -> Parser Regex pQuantifierModifier re = option re $ (Possessive re <$ char '+') <|> (Lazy re <$ char '?') pRegexChar :: RParser Regex pRegexChar = do w <- lift anyChar caseSensitive <- gets rsCaseSensitive case w of '.' -> return MatchAnyChar '%' -> (do -- dynamic %1 %2 ds <- lift $ many1 digit case readMay ds of Just !n -> return $ MatchDynamic n Nothing -> fail "not a number") <|> return (MatchChar (== '%')) '\\' -> lift pRegexEscapedChar '$' -> return AssertEnd '^' -> return AssertBeginning '[' -> lift pRegexCharClass _ | isSpecial w -> mzero | otherwise -> return $! MatchChar $ if caseSensitive then (== w) else (\d -> toLower d == toLower w) pRegexEscapedChar :: Parser Regex pRegexEscapedChar = do c <- A.anyChar (case c of 'b' -> return AssertWordBoundary 'B' -> return $ AssertNegative Forward AssertWordBoundary '{' -> do -- captured pattern: \1 \2 \{12} ds <- many1 digit _ <- char '}' case readMay ds of Just !n -> return $ MatchCaptured n Nothing -> fail "not a number" 'd' -> return $ MatchChar isDigit 'D' -> return $ MatchChar (not . isDigit) 's' -> return $ MatchChar isSpace 'S' -> return $ MatchChar (not . isSpace) 'w' -> return $ MatchChar isWordChar 'W' -> return $ MatchChar (not . isWordChar) 'p' -> MatchChar <$> pUnicodeCharClass _ | isDigit c -> return $! MatchCaptured (ord c - ord '0') | otherwise -> mzero) <|> (MatchChar . (==) <$> pEscaped c) pEscaped :: Char -> Parser Char pEscaped c = case c of '\\' -> return c 'a' -> return '\a' 'f' -> return '\f' 'n' -> return '\n' 'r' -> return '\r' 't' -> return '\t' 'v' -> return '\v' '0' -> do -- \0ooo matches octal ooo ds <- A.take 3 case readMay ("'\\o" ++ T.unpack ds ++ "'") of Just x -> return x Nothing -> fail "invalid octal character escape" _ | c >= '1' && c <= '7' -> do -- \123 matches octal 123, \1 matches octal 1 let octalDigitScanner s w | s < 3, isOctDigit w = Just (s + 1) -- digits 0-7 | otherwise = Nothing ds <- A.scan (1 :: Int) octalDigitScanner case readMay ("'\\o" ++ [c] ++ T.unpack ds ++ "'") of Just x -> return x Nothing -> fail "invalid octal character escape" 'z' -> do -- \zhhhh matches unicode hex char hhhh ds <- A.take 4 case readMay ("'\\x" ++ T.unpack ds ++ "'") of Just x -> return x Nothing -> fail "invalid hex character escape" 'x' -> do -- \xhh matches hex hh, \x{h+} matches hex h+ ds <- (char '{' *> A.takeWhile (/= '}') <* char '}') <|> A.take 2 case readMay ("'\\x" ++ T.unpack ds ++ "'") of Just x -> return x Nothing -> fail "invalid hex character escape" _ | isPunctuation c || isSymbol c || isSpace c -> return c | otherwise -> fail $ "invalid escape \\" ++ [c] pRegexCharClass :: Parser Regex pRegexCharClass = do negated <- option False $ True <$ char '^' let getEscapedClass = do _ <- char '\\' (isDigit <$ char 'd') <|> (not . isDigit <$ char 'D') <|> (isSpace <$ char 's') <|> (not . isSpace <$ char 'S') <|> (isWordChar <$ char 'w') <|> (not . isWordChar <$ char 'W') let getPosixClass = do _ <- string "[:" localNegated <- option False $ True <$ char '^' res <- (isAlphaNum <$ string "alnum") <|> (isAlpha <$ string "alpha") <|> (isAscii <$ string "ascii") <|> ((\c -> isSpace c && c `notElem` ['\n','\r','\f','\v']) <$ string "blank") <|> (isControl <$ string "cntrl") <|> ((\c -> isPrint c || isSpace c) <$ string "graph:") <|> (isLower <$ string "lower") <|> (isUpper <$ string "upper") <|> (isPrint <$ string "print") <|> (isPunctuation <$ string "punct") <|> (isSpace <$ string "space") <|> ((\c -> isAlphaNum c || generalCategory c == ConnectorPunctuation) <$ string "word:") <|> (isHexDigit <$ string "xdigit") _ <- string ":]" return $! if localNegated then not . res else res let getC = (char '\\' *> anyChar >>= pEscaped) <|> satisfy (\c -> c /= '\\' && c /= ']') let getCRange = do c <- getC (\d x -> x >= c && x <= d) <$> (char '-' *> getC) <|> return (== c) let getQELiteral = do void $ A.string "\\Q" cs <- manyTill anyChar (A.string "\\E") return $! \c -> any (== c) cs brack <- option [] $ [(==']')] <$ char ']' fs <- many (getQELiteral <|> getEscapedClass <|> getPosixClass <|> getCRange <|> (A.string "\\p" *> pUnicodeCharClass)) void $ char ']' let f c = any ($ c) $ brack ++ fs return $! MatchChar $ if negated then not . f else f -- character class \p{Lo}; we assume \p is already parsed pUnicodeCharClass :: Parser (Char -> Bool) pUnicodeCharClass = do ds <- char '{' *> A.takeWhile (/= '}') <* char '}' return $ (case ds of "Lu" -> (== UppercaseLetter) "Ll" -> (== LowercaseLetter) "Lt" -> (== TitlecaseLetter) "Lm" -> (== ModifierLetter) "Lo" -> (== OtherLetter) "L" -> (\c -> c == UppercaseLetter || c == LowercaseLetter || c == TitlecaseLetter || c == ModifierLetter || c == OtherLetter) "Mn" -> (== NonSpacingMark) "Mc" -> (== SpacingCombiningMark) "Me" -> (== EnclosingMark) "M" -> (\c -> c == NonSpacingMark || c == SpacingCombiningMark || c == EnclosingMark) "Nd" -> (== DecimalNumber) "Nl" -> (== LetterNumber) "No" -> (== OtherNumber) "N" -> (\c -> c == DecimalNumber || c == LetterNumber || c == OtherNumber) "Pc" -> (== ConnectorPunctuation) "Pd" -> (== DashPunctuation) "Ps" -> (== OpenPunctuation) "Pe" -> (== ClosePunctuation) "Pi" -> (== InitialQuote) "Pf" -> (== FinalQuote) "Po" -> (== OtherPunctuation) "P" -> (\c -> c == ConnectorPunctuation || c == DashPunctuation || c == OpenPunctuation || c == ClosePunctuation || c == InitialQuote || c == FinalQuote || c == OtherPunctuation) "Sm" -> (== MathSymbol) "Sc" -> (== CurrencySymbol) "Sk" -> (== ModifierSymbol) "So" -> (== OtherSymbol) "S" -> (\c -> c == MathSymbol || c == CurrencySymbol || c == ModifierSymbol || c == OtherSymbol) "Zs" -> (== Space) "Zl" -> (== LineSeparator) "Zp" -> (== ParagraphSeparator) "Z" -> (\c -> c == Space || c == LineSeparator || c == ParagraphSeparator) "Cc" -> (== Control) "Cf" -> (== Format) "Cs" -> (== Surrogate) "Co" -> (== PrivateUse) "Cn" -> (== NotAssigned) "C" -> (\c -> c == Control || c == Format || c == Surrogate || c == PrivateUse || c == NotAssigned) _ -> const False) . generalCategory isSpecial :: Char -> Bool isSpecial '\\' = True isSpecial '?' = True isSpecial '*' = True isSpecial '+' = True -- isSpecial '{' = True -- this is okay except in suffixes isSpecial '[' = True isSpecial ']' = True isSpecial '%' = True isSpecial '(' = True isSpecial ')' = True isSpecial '|' = True isSpecial '.' = True isSpecial '$' = True isSpecial '^' = True isSpecial _ = False