{-# OPTIONS_GHC -fno-warn-orphans -fno-warn-unused-binds #-} {-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE FlexibleInstances, BangPatterns #-} {-# LANGUAGE TemplateHaskell, QuasiQuotes #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE UnicodeSyntax, CPP #-} -- | A usable regular expressions library on top of pcre-light. module Text.Regex.PCRE.Heavy ( -- * Matching (=~) , (≈) , scan , scanO , scanRanges , scanRangesO -- * Replacement , sub , subO , gsub , gsubO -- * Splitting , split , splitO -- * QuasiQuoter , re , mkRegexQQ -- * Types and stuff from pcre-light , Regex , PCREOption , PCRE.compileM -- * Advanced raw stuff , rawMatch , rawSub ) where #if __GLASGOW_HASKELL__ < 709 import Control.Applicative ((<$>)) #endif import Language.Haskell.TH hiding (match) import Language.Haskell.TH.Quote import Language.Haskell.TH.Syntax import qualified Text.Regex.PCRE.Light as PCRE import Text.Regex.PCRE.Light.Base import Data.Maybe (isJust, fromMaybe) import Data.List (unfoldr, mapAccumL) import Data.Stringable import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Internal as BS import System.IO.Unsafe (unsafePerformIO) import Foreign import Debug.Trace substr ∷ BS.ByteString → (Int, Int) → BS.ByteString substr s (f, t) = BS.take (t - f) . BS.drop f $ s behead ∷ [a] → (a, [a]) behead (h:t) = (h, t) behead [] = error "no head to behead" reMatch ∷ Stringable a ⇒ Regex → a → Bool reMatch r s = isJust $ PCRE.match r (toByteString s) [] -- | Checks whether a string matches a regex. -- -- >>> :set -XQuasiQuotes -- >>> "https://unrelenting.technology" =~ [re|^http.*|] -- True (=~) ∷ Stringable a ⇒ a → Regex → Bool (=~) = flip reMatch -- | Same as =~. (≈) ∷ Stringable a ⇒ a → Regex → Bool (≈) = (=~) -- | Does raw PCRE matching (you probably shouldn't use this directly). -- -- >>> :set -XOverloadedStrings -- >>> rawMatch [re|\w{2}|] "a a ab abc ba" 0 [] -- Just [(4,6)] -- >>> rawMatch [re|\w{2}|] "a a ab abc ba" 6 [] -- Just [(7,9)] -- >>> rawMatch [re|(\w)(\w)|] "a a ab abc ba" 0 [] -- Just [(4,6),(4,5),(5,6)] rawMatch ∷ Regex → BS.ByteString → Int → [PCREExecOption] → Maybe [(Int, Int)] rawMatch r@(Regex pcreFp _) s offset opts = unsafePerformIO $ do withForeignPtr pcreFp $ \pcrePtr → do let nCapt = PCRE.captureCount r ovecSize = (nCapt + 1) * 3 ovecBytes = ovecSize * size_of_cint allocaBytes ovecBytes $ \ovec → do let (strFp, off, len) = BS.toForeignPtr s withForeignPtr strFp $ \strPtr → do results ← c_pcre_exec pcrePtr nullPtr (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset) (combineExecOptions opts) ovec (fromIntegral ovecSize) if results < 0 then return Nothing else let loop n o acc = if n == results then return $ Just $ reverse acc else do i ← peekElemOff ovec $! o j ← peekElemOff ovec (o + 1) loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc) in loop 0 0 [] nextMatch ∷ Regex → [PCREExecOption] → BS.ByteString → Int → Maybe ([(Int, Int)], Int) nextMatch r opts str offset = case rawMatch r str offset opts of Nothing → Nothing Just [] → Nothing Just ms → Just (ms, maximum $ map snd ms) -- | Searches the string for all matches of a given regex. -- -- >>> scan [re|\s*entry (\d+) (\w+)\s*&?|] " entry 1 hello &entry 2 hi" -- [(" entry 1 hello &",["1","hello"]),("entry 2 hi",["2","hi"])] -- -- It is lazy! If you only need the first match, just apply 'head' (or -- 'headMay' from the "safe" library) -- no extra work will be performed! -- -- >>> head $ scan [re|\s*entry (\d+) (\w+)\s*&?|] " entry 1 hello &entry 2 hi" -- (" entry 1 hello &",["1","hello"]) scan ∷ (Stringable a) ⇒ Regex → a → [(a, [a])] scan r s = scanO r [] s -- | Exactly like 'scan', but passes runtime options to PCRE. scanO ∷ (Stringable a) ⇒ Regex → [PCREExecOption] → a → [(a, [a])] scanO r opts s = map behead $ map (fromByteString . substr str) <$> unfoldr (nextMatch r opts str) 0 where str = toByteString s -- | Searches the string for all matches of a given regex, like 'scan', but -- returns positions inside of the string. -- -- >>> scanRanges [re|\s*entry (\d+) (\w+)\s*&?|] " entry 1 hello &entry 2 hi" -- [((0,17),[(7,8),(9,14)]),((17,27),[(23,24),(25,27)])] -- -- And just like 'scan', it's lazy. scanRanges ∷ (Stringable a) ⇒ Regex → a → [((Int, Int), [(Int, Int)])] scanRanges r s = scanRangesO r [] s -- | Exactly like 'scanRanges', but passes runtime options to PCRE. scanRangesO ∷ Stringable a ⇒ Regex → [PCREExecOption] → a → [((Int, Int), [(Int, Int)])] scanRangesO r opts s = map behead $ unfoldr (nextMatch r opts str) 0 where str = toByteString s class RegexReplacement a where performReplacement ∷ BS.ByteString → [BS.ByteString] → a → BS.ByteString instance {-# OVERLAPPABLE #-} Stringable a ⇒ RegexReplacement a where performReplacement _ _ to = toByteString to instance Stringable a ⇒ RegexReplacement (a → [a] → a) where performReplacement from groups replacer = toByteString $ replacer (fromByteString from) (map fromByteString groups) instance Stringable a ⇒ RegexReplacement (a → a) where performReplacement from _ replacer = toByteString $ replacer (fromByteString from) instance Stringable a ⇒ RegexReplacement ([a] → a) where performReplacement _ groups replacer = toByteString $ replacer (map fromByteString groups) rawSub ∷ RegexReplacement r ⇒ Regex → r → BS.ByteString → Int → [PCREExecOption] → Maybe (BS.ByteString, Int) rawSub r t s offset opts = case rawMatch r s offset opts of Just ((begin, end):groups) → let replacement = performReplacement (substr s (begin, end)) (map (substr s) groups) t in Just (BS.concat [ substr s (0, begin) , replacement , substr s (end, BS.length s)], begin + BS.length replacement) _ → Nothing -- | Replaces the first occurence of a given regex. -- -- >>> sub [re|thing|] "world" "Hello, thing thing" :: String -- "Hello, world thing" -- -- >>> sub [re|a|] "b" "c" :: String -- "c" -- -- >>> sub [re|bad|] "xxxbad" "this is bad, right?" -- "this is xxxbad, right?" -- -- You can use functions! -- A function of Stringable gets the full match. -- A function of [Stringable] gets the groups. -- A function of Stringable → [Stringable] gets both. -- -- >>> sub [re|%(\d+)(\w+)|] (\(d:w:_) -> "{" ++ d ++ " of " ++ w ++ "}" :: String) "Hello, %20thing" :: String -- "Hello, {20 of thing}" sub ∷ (Stringable a, RegexReplacement r) ⇒ Regex → r → a → a sub r t s = subO r [] t s -- | Exactly like 'sub', but passes runtime options to PCRE. subO ∷ (Stringable a, RegexReplacement r) ⇒ Regex → [PCREExecOption] → r → a → a subO r opts t s = fromMaybe s $ fromByteString <$> fst <$> rawSub r t (toByteString s) 0 opts -- | Replaces all occurences of a given regex. -- -- See 'sub' for more documentation. -- -- >>> gsub [re|thing|] "world" "Hello, thing thing" :: String -- "Hello, world world" -- -- >>> gsub [re||] "" "Hello, world" :: String -- "Hello, world" -- -- https://github.com/myfreeweb/pcre-heavy/issues/2 -- >>> gsub [re|good|] "bad" "goodgoodgood" -- "badbadbad" -- -- >>> gsub [re|bad|] "xxxbad" "this is bad, right? bad" -- "this is xxxbad, right? xxxbad" gsub ∷ (Stringable a, RegexReplacement r) ⇒ Regex → r → a → a gsub r t s = gsubO r [] t s -- | Exactly like 'gsub', but passes runtime options to PCRE. gsubO ∷ (Stringable a, RegexReplacement r) ⇒ Regex → [PCREExecOption] → r → a → a gsubO r opts t s = fromByteString $ loop 0 str where str = toByteString s loop offset acc = case rawSub r t acc offset opts of Just (result, newOffset) → if newOffset == offset then acc else loop newOffset result _ → acc -- | Splits the string using the given regex. -- -- Is lazy. -- -- >>> split [re|%(begin|next|end)%|] "%begin%hello%next%world%end%" -- ["","hello","world",""] -- -- >>> split [re|%(begin|next|end)%|] "" -- [""] split ∷ Stringable a ⇒ Regex → a → [a] split r s = splitO r [] s -- | Exactly like 'split', but passes runtime options to PCRE. splitO ∷ Stringable a ⇒ Regex → [PCREExecOption] → a → [a] splitO r opts s = map fromByteString $ map' (substr str) partRanges where map' f = foldr ((:) . f) [f (lastL, BS.length str)] -- avoiding the snoc operation (lastL, partRanges) = mapAccumL invRange 0 ranges invRange acc (xl, xr) = (xr, (acc, xl)) ranges = map fst $ scanRangesO r opts str str = toByteString s instance Lift PCREOption where -- well, the constructor isn't exported, but at least it implements Read/Show :D lift o = let o' = show o in [| read o' ∷ PCREOption |] quoteExpRegex ∷ [PCREOption] → String → ExpQ quoteExpRegex opts txt = [| PCRE.compile (toByteString (txt ∷ String)) opts |] where !_ = PCRE.compile (toByteString txt) opts -- check at compile time -- | Returns a QuasiQuoter like 're', but with given PCRE options. mkRegexQQ ∷ [PCREOption] → QuasiQuoter mkRegexQQ opts = QuasiQuoter { quoteExp = quoteExpRegex opts , quotePat = undefined , quoteType = undefined , quoteDec = undefined } -- | A QuasiQuoter for regular expressions that does a compile time check. re ∷ QuasiQuoter re = mkRegexQQ [utf8]