{-# LANGUAGE ForeignFunctionInterface, EmptyDataDecls #-} {-# CFILES src/cbits/match.c #-} -- | Regular expression matching via PCRE through the FFI module MOO.Builtins.Match ( Regexp , MatchResult(..) -- ** Compiling , newRegexp -- ** Matching , match , rmatch ) where import Foreign (Ptr, FunPtr, ForeignPtr, alloca, allocaArray, nullPtr, peek, peekArray, peekByteOff, pokeByteOff, newForeignPtr, mallocForeignPtrBytes, withForeignPtr, (.|.)) import Foreign.C (CString, CInt(..), CULong, peekCString) import Control.Monad (liftM) import Data.Text (Text) import Data.Text.Encoding (encodeUtf8, decodeUtf8) import Data.ByteString (ByteString, useAsCString, useAsCStringLen) import System.IO.Unsafe (unsafePerformIO) import qualified Data.Text as T import qualified Data.ByteString as BS {-# ANN module ("HLint: ignore Avoid lambda" :: String) #-} #include data PCRE data PCREExtra data CharacterTables foreign import ccall unsafe "static pcre.h" pcre_compile :: CString -> CInt -> Ptr CString -> Ptr CInt -> Ptr CharacterTables -> IO (Ptr PCRE) foreign import ccall unsafe "static pcre.h" pcre_study :: Ptr PCRE -> CInt -> Ptr CString -> IO (Ptr PCREExtra) foreign import ccall unsafe "static pcre.h &" pcre_free_study :: FunPtr (Ptr PCREExtra -> IO ()) foreign import ccall unsafe "static pcre.h &" pcre_free :: Ptr (FunPtr (Ptr a -> IO ())) -- This must be /unsafe/ to block other threads while it executes, since it -- relies on being able to modify and use some PCRE global state. foreign import ccall unsafe "static match.h" match_helper :: Ptr PCRE -> Ptr PCREExtra -> CString -> CInt -> CInt -> Ptr CInt -> IO CInt -- This must be /unsafe/ to block other threads while it executes, since it -- relies on being able to modify and use some PCRE global state. foreign import ccall unsafe "static match.h" rmatch_helper :: Ptr PCRE -> Ptr PCREExtra -> CString -> CInt -> CInt -> Ptr CInt -> IO CInt data Regexp = Regexp { pattern :: Text , caseMatters :: Bool , code :: ForeignPtr PCRE , extra :: ForeignPtr PCREExtra } deriving Show instance Eq Regexp where Regexp { pattern = p1, caseMatters = cm1 } == Regexp { pattern = p2, caseMatters = cm2 } = cm1 == cm2 && p1 == p2 data RewriteState = StateBase | StateEsc | StateCsetInit | StateCsetInit2 | StateCset {-# ANN translate ("HLint: ignore Use list literal" :: String) #-} -- | Translate MOO regular expression syntax into PCRE syntax. -- -- Aside from changing % to \ and sundry tweaks we also address an -- incompatibility between MOO %b, %B, %w, %W and PCRE \b, \B, \w, \W -- -- namely, the inclusion of _ in \w and its absence in %w. translate :: Text -> Text translate = T.pack . concat . rewrite . T.unpack where -- wrap entire expression so we can add a callout at the end rewrite s = "(?:" : rewrite' StateBase s rewrite' StateBase ('%':cs) = rewrite' StateEsc cs rewrite' StateBase ('[':cs) = "[" : rewrite' StateCsetInit cs rewrite' StateBase ( c :cs) | c `elem` "\\|(){" = "\\" : [c] : rewrite' StateBase cs | otherwise = [c] : rewrite' StateBase cs rewrite' StateCsetInit ('^':cs) = "^" : rewrite' StateCsetInit2 cs rewrite' StateCsetInit ( c :cs) | c `elem` "\\[" = "\\" : [c] : rewrite' StateCset cs | otherwise = [c] : rewrite' StateCset cs rewrite' StateCsetInit2 (c:cs) | c `elem` "\\[" = "\\" : [c] : rewrite' StateCset cs | otherwise = [c] : rewrite' StateCset cs rewrite' StateCset (']':cs) = "]" : rewrite' StateBase cs rewrite' StateCset ( c :cs) | c `elem` "\\[" = "\\" : [c] : rewrite' StateCset cs | otherwise = [c] : rewrite' StateCset cs -- insert a null-op (comment) to prevent special sequences rewrite' StateEsc ('(':cs) = "((?#)" : rewrite' StateBase cs rewrite' StateEsc ('b':cs) = alt wordBegin wordEnd : rewrite' StateBase cs rewrite' StateEsc ('B':cs) = alt (lookba word word) (lookba nonword nonword) : rewrite' StateBase cs rewrite' StateEsc ('<':cs) = wordBegin : rewrite' StateBase cs rewrite' StateEsc ('>':cs) = wordEnd : rewrite' StateBase cs rewrite' StateEsc ('w':cs) = word : rewrite' StateBase cs rewrite' StateEsc ('W':cs) = nonword : rewrite' StateBase cs rewrite' StateEsc ( c :cs) | c `elem` ['1'..'9'] = "\\" : [c] : "(?#)" : rewrite' StateBase cs | c `elem` "\\^$.[?*+{" = "\\" : [c] : rewrite' StateBase cs | otherwise = [c] : rewrite' StateBase cs -- add callout at end of pattern for rmatch rewrite' state [] = ")(?C)" : rewriteFinal state -- don't let a trailing % get away without a syntax error rewriteFinal StateEsc = "\\" : [] rewriteFinal _ = [] word = "[^\\W_]" nonword = "[\\W_]" alt a b = "(?:" ++ a ++ "|" ++ b ++ ")" lbehind p = "(?<=" ++ p ++ ")" lahead p = "(?=" ++ p ++ ")" lookba b a = lbehind b ++ lahead a wordBegin = alt "^" (lbehind nonword) ++ lahead word wordEnd = lbehind word ++ alt "$" (lahead nonword) -- | Compile a regular expression pattern into a 'Regexp' value, or return an -- error description if the pattern is malformed. The returned 'Int' is a byte -- offset into an internally translated pattern, and thus is probably not very -- useful. newRegexp :: Text -- ^ pattern -> Bool -- ^ case matters? -> Either (String, Int) Regexp newRegexp regexp caseMatters = unsafePerformIO $ useAsCString string $ \pattern -> alloca $ \errorPtr -> alloca $ \errorOffsetPtr -> do code <- pcre_compile pattern options errorPtr errorOffsetPtr nullPtr if code == nullPtr then do error <- peek errorPtr >>= peekCString errorOffset <- peek errorOffsetPtr return $ Left (patchError error, fromIntegral errorOffset) else do extraFP <- mkExtra code setExtraFlags extraFP codeFP <- peek pcre_free >>= flip newForeignPtr code return $ Right Regexp { pattern = regexp , caseMatters = caseMatters , code = codeFP , extra = extraFP } where string = encodeUtf8 (translate regexp) mkExtra code = alloca $ \errorPtr -> do extra <- pcre_study code 0 errorPtr if extra == nullPtr then do extraFP <- mallocForeignPtrBytes #{const sizeof(pcre_extra)} withForeignPtr extraFP $ \extra -> #{poke pcre_extra, flags} extra (0 :: CULong) return extraFP else newForeignPtr pcre_free_study extra setExtraFlags extraFP = withForeignPtr extraFP $ \extra -> do #{poke pcre_extra, match_limit} extra matchLimit #{poke pcre_extra, match_limit_recursion} extra matchLimitRecursion flags <- #{peek pcre_extra, flags} extra #{poke pcre_extra, flags} extra $ flags .|. (0 :: CULong) .|. #{const PCRE_EXTRA_MATCH_LIMIT} .|. #{const PCRE_EXTRA_MATCH_LIMIT_RECURSION} matchLimit = 100000 :: CULong matchLimitRecursion = 5000 :: CULong patchError = concatMap patch where patch '\\' = "%" patch '(' = "%(" patch ')' = "%)" patch c = [c] options = #{const PCRE_UTF8 | PCRE_NO_UTF8_CHECK} -- allow PCRE to optimize .* at beginning of pattern by implicit anchor .|. #{const PCRE_DOTALL} .|. if caseMatters then 0 else #{const PCRE_CASELESS} maxCaptures = 10 ovecLen = maxCaptures * 3 data MatchResult = MatchFailed | MatchAborted | MatchSucceeded [(Int, Int)] deriving Show -- | Perform regular expression matching. match :: Regexp -> Text -> MatchResult match regexp text = unsafePerformIO $ doMatch match_helper regexp text -- | Perform regular expression matching, returning the rightmost match. rmatch :: Regexp -> Text -> MatchResult rmatch regexp text = unsafePerformIO $ doMatch rmatch_helper regexp text doMatch :: (Ptr PCRE -> Ptr PCREExtra -> CString -> CInt -> CInt -> Ptr CInt -> IO CInt) -> Regexp -> Text -> IO MatchResult doMatch helper Regexp { code = codeFP, extra = extraFP } text = withForeignPtr codeFP $ \code -> withForeignPtr extraFP $ \extra -> useAsCStringLen string $ \(cstring, len) -> allocaArray ovecLen $ \ovec -> do rc <- helper code extra cstring (fromIntegral len) options ovec if rc < 0 then case rc of #{const PCRE_ERROR_NOMATCH} -> return MatchFailed _ -> return MatchAborted else mkMatchResult rc ovec subject where string = encodeUtf8 text subject = (string, T.length text) options = #{const PCRE_NO_UTF8_CHECK} mkMatchResult :: CInt -> Ptr CInt -> (ByteString, Int) -> IO MatchResult mkMatchResult rc ovec (subject, subjectCharLen) = (MatchSucceeded . pairs . map (rebase . fromIntegral)) `liftM` peekArray (n * 2) ovec where rc' = fromIntegral rc n = if rc' == 0 || rc' > maxCaptures then maxCaptures else rc' pairs (s:e:rs) = (s, e) : pairs rs pairs [] = [] -- translate UTF-8 byte offset to character offset rebase 0 = 0 rebase i = subjectCharLen - T.length (decodeUtf8 $ BS.drop i subject)