{-# LANGUAGE BangPatterns #-} module Regex.Internal.Text ( TextToken , REText , token , satisfy , char , charIgnoreCase , anyChar , oneOf , text , textIgnoreCase , manyText , someText , manyTextMin , someTextMin , manyTextOf , someTextOf , manyTextOfMin , someTextOfMin , naturalDec , integerDec , naturalHex , integerHex , wordRangeDec , intRangeDec , wordRangeHex , intRangeHex , wordDecN , wordHexN , toMatch , withMatch , reParse , ParserText , parse , parseSure , find , findAll , splitOn , replace , replaceAll ) where import Control.Applicative import Data.Char import Data.Foldable (foldl') import Data.Maybe (fromMaybe) import Numeric.Natural import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Array as TArray import qualified Data.Text.Internal as TInternal import qualified Data.Text.Unsafe as TUnsafe import qualified Data.Text.Internal.Encoding.Utf8 as TInternalUtf8 import Data.CharSet (CharSet) import qualified Data.CharSet as CS import Regex.Internal.Parser (Parser) import qualified Regex.Internal.Parser as P import Regex.Internal.Regex (RE(..), Greediness(..), Strictness(..)) import qualified Regex.Internal.Regex as R import qualified Regex.Internal.Num as RNum import qualified Regex.Internal.Generated.CaseFold as CF ---------------------- -- Token and Text REs ---------------------- -- | The token type used for parsing @Text@. -- This module uses RE TextToken for Text regexes instead of simply RE Char to -- support Text slicing. It does mean that use cases not using slicing pay a -- small cost, but it is not worth having two separate Text regex APIs. data TextToken = TextToken { tArr :: {-# UNPACK #-} !TArray.Array , tOffset :: {-# UNPACK #-} !Int , tChar :: {-# UNPACK #-} !Char } -- | A type alias for convenience. -- -- A function which accepts a @RE c a@ will accept a @REText a@. type REText = RE TextToken -- | A type alias for convenience. -- -- A function which accepts a @Parser c a@ will accept a @ParserText a@. type ParserText = Parser TextToken -- | Parse a @Char@ into an @a@ if the given function returns @Just@. token :: (Char -> Maybe a) -> REText a token t = R.token (\ !tok -> t (tChar tok)) {-# INLINE token #-} -- | Parse a @Char@ if it satisfies the given predicate. satisfy :: (Char -> Bool) -> REText Char satisfy p = token $ \c -> if p c then Just c else Nothing {-# INLINE satisfy #-} -- | Parse the given @Char@. char :: Char -> REText Char char !c = satisfy (c==) -- | Parse the given @Char@, ignoring case. -- -- Comparisons are performed after applying -- [simple case folding](https://www.unicode.org/reports/tr44/#Simple_Case_Folding) -- as described by the Unicode standard. charIgnoreCase :: Char -> REText Char charIgnoreCase c = satisfy $ (c'==) . CF.caseFoldSimple where !c' = CF.caseFoldSimple c -- See Note [Why simple case fold] -- | Parse any @Char@. anyChar :: REText Char anyChar = token Just -- | Parse a @Char@ if it is a member of the @CharSet@. oneOf :: CharSet -> REText Char oneOf !cs = satisfy (`CS.member` cs) -- | Parse the given @Text@. text :: Text -> REText Text text t = t <$ T.foldr' ((*>) . char) (pure ()) t -- | Parse the given @Text@, ignoring case. -- -- Comparisons are performed after applying -- [simple case folding](https://www.unicode.org/reports/tr44/#Simple_Case_Folding) -- as described by the Unicode standard. textIgnoreCase :: Text -> REText Text textIgnoreCase t = T.foldr' (\c cs -> R.liftA2' adjacentAppend (ignoreCaseTokenMatch c) cs) (pure T.empty) t -- See Note [Why simple case fold] -- | Parse any @Text@. Biased towards matching more. manyText :: REText Text manyText = R.foldlMany' adjacentAppend T.empty anyTokenMatch -- | Parse any non-empty @Text@. Biased towards matching more. someText :: REText Text someText = R.liftA2' adjacentAppend anyTokenMatch manyText -- | Parse any @Text@. Minimal, i.e. biased towards matching less. manyTextMin :: REText Text manyTextMin = R.foldlManyMin' adjacentAppend T.empty anyTokenMatch -- | Parse any non-empty @Text@. Minimal, i.e. biased towards matching less. someTextMin :: REText Text someTextMin = R.liftA2' adjacentAppend anyTokenMatch manyTextMin -- | Parse any @Text@ containing members of the @CharSet@. -- Biased towards matching more. manyTextOf :: CharSet -> REText Text manyTextOf !cs = R.foldlMany' adjacentAppend T.empty (oneOfTokenMatch cs) -- | Parse any non-empty @Text@ containing members of the @CharSet@. -- Biased towards matching more. someTextOf :: CharSet -> REText Text someTextOf !cs = R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOf cs) -- | Parse any @Text@ containing members of the @CharSet@. -- Minimal, i.e. biased towards matching less. manyTextOfMin :: CharSet -> REText Text manyTextOfMin !cs = R.foldlManyMin' adjacentAppend T.empty (oneOfTokenMatch cs) -- | Parse any non-empty @Text@ containing members of the @CharSet@. -- Minimal, i.e. biased towards matching less. someTextOfMin :: CharSet -> REText Text someTextOfMin !cs = R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOfMin cs) ----------------- -- Numeric REs ----------------- -- | Parse a decimal @Natural@. -- Leading zeros are not accepted. Biased towards matching more. naturalDec :: REText Natural naturalDec = RNum.mkNaturalDec digitRange -- | Parse a decimal @Integer@. Parse an optional sign, @\'-\'@ or @\'+\'@, -- followed by the given @RE@, followed by the absolute value of the integer. -- Leading zeros are not accepted. Biased towards matching more. integerDec :: REText a -> REText Integer integerDec sep = RNum.mkSignedInteger minus plus (sep *> naturalDec) -- | Parse a hexadecimal @Natural@. Both uppercase @\'A\'..\'F\'@ and lowercase -- @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. naturalHex :: REText Natural naturalHex = RNum.mkNaturalHex hexDigitRange -- | Parse a hexadecimal @Integer@. Parse an optional sign, @\'-\'@ or @\'+\'@, -- followed by the given @RE@, followed by the absolute value of the integer. -- Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. integerHex :: REText a -> REText Integer integerHex sep = RNum.mkSignedInteger minus plus (sep *> naturalHex) -- | Parse a decimal @Word@ in the range @[low..high]@. -- Leading zeros are not accepted. Biased towards matching more. wordRangeDec :: (Word, Word) -> REText Word wordRangeDec lh = RNum.mkWordRangeDec digitRange lh -- | Parse a decimal @Int@ in the range @[low..high]@. Parse an optional sign, -- @\'-\'@ or @\'+\'@, followed by the given @RE@, followed by the absolute -- value of the integer. -- Leading zeros are not accepted. Biased towards matching more. intRangeDec :: REText a -> (Int, Int) -> REText Int intRangeDec sep lh = RNum.mkSignedIntRange minus plus ((sep *>) . wordRangeDec) lh -- | Parse a hexadecimal @Word@ in the range @[low..high]@. Both uppercase -- @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. wordRangeHex :: (Word, Word) -> REText Word wordRangeHex lh = RNum.mkWordRangeHex hexDigitRange lh -- | Parse a hexadecimal @Int@ in the range @[low..high]@. Parse an optional -- sign, @\'-\'@ or @\'+\'@, followed by the given @RE@, followed by the -- absolute value of the integer. -- Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. intRangeHex :: REText a -> (Int, Int) -> REText Int intRangeHex sep lh = RNum.mkSignedIntRange minus plus ((sep *>) . wordRangeHex) lh -- | Parse a @Word@ of exactly n decimal digits, including any leading zeros. -- Will not parse values that do not fit in a @Word@. -- Biased towards matching more. wordDecN :: Int -> REText Word wordDecN n = RNum.mkWordDecN digitRange n -- | Parse a @Word@ of exactly n hexadecimal digits, including any leading -- zeros. Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are -- accepted. Will not parse values that do not fit in a @Word@. -- Biased towards matching more. wordHexN :: Int -> REText Word wordHexN n = RNum.mkWordHexN hexDigitRange n minus, plus :: REText () minus = token $ \c -> if c == '-' then Just () else Nothing plus = token $ \c -> if c == '+' then Just () else Nothing -- l and h must be in [0..9] digitRange :: Word -> Word -> REText Word digitRange !l !h = token $ \c -> let d = fromIntegral (ord c - ord '0') in if l <= d && d <= h then Just d else Nothing -- l and h must be in [0..15] hexDigitRange :: Word -> Word -> REText Word hexDigitRange !l !h = token $ \c -> let dec = fromIntegral (ord c - ord '0') hexl = fromIntegral (ord c - ord 'a') hexu = fromIntegral (ord c - ord 'A') in do d <- case () of _ | dec <= 9 -> Just dec | hexl <= 5 -> Just $! 10 + hexl | hexu <= 5 -> Just $! 10 + hexu | otherwise -> Nothing if l <= d && d <= h then Just d else Nothing -- TODO: This can surely be optimized ---------------- -- Match stuff ---------------- tokenToSlice :: TextToken -> Text tokenToSlice t = TInternal.Text (tArr t) (tOffset t) (TInternalUtf8.utf8Length (tChar t)) tokenMatch :: (TextToken -> Maybe a) -> REText Text tokenMatch t = R.token (\ !tok -> tokenToSlice tok <$ t tok) tokenWithMatch :: (TextToken -> Maybe a) -> REText (WithMatch a) tokenWithMatch t = R.token (\ !tok -> WM (tokenToSlice tok) <$> t tok) anyTokenMatch :: REText Text anyTokenMatch = R.token (\tok -> Just $! tokenToSlice tok) ignoreCaseTokenMatch :: Char -> REText Text ignoreCaseTokenMatch c = R.token $ \tok -> if CF.caseFoldSimple (tChar tok) == c' then Just $! tokenToSlice tok else Nothing where !c' = CF.caseFoldSimple c oneOfTokenMatch :: CharSet -> REText Text oneOfTokenMatch !cs = R.token $ \tok -> if CS.member (tChar tok) cs then Just $! tokenToSlice tok else Nothing -- | Rebuild the @RE@ such that the result is the matched @Text@ instead. toMatch :: REText a -> REText Text toMatch = go where go :: REText b -> REText Text go re = case re of RToken t -> tokenMatch t RFmap _ _ re1 -> go re1 RFmap_ _ re1 -> go re1 RPure _ -> RPure T.empty RLiftA2 _ _ re1 re2 -> RLiftA2 Strict adjacentAppend (go re1) (go re2) REmpty -> REmpty RAlt re1 re2 -> RAlt (go re1) (go re2) RMany _ _ _ _ re1 -> RFold Strict Greedy adjacentAppend T.empty (go re1) RFold _ gr _ _ re1 -> RFold Strict gr adjacentAppend T.empty (go re1) data WithMatch a = WM {-# UNPACK #-} !Text a instance Functor WithMatch where fmap f (WM t x) = WM t (f x) fmapWM' :: (a -> b) -> WithMatch a -> WithMatch b fmapWM' f (WM t x) = WM t $! f x instance Applicative WithMatch where pure = WM T.empty liftA2 f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) (f x y) liftA2WM' :: (a1 -> a2 -> b) -> WithMatch a1 -> WithMatch a2 -> WithMatch b liftA2WM' f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) $! f x y -- | Rebuild the @RE@ to include the matched @Text@ alongside the result. withMatch :: REText a -> REText (Text, a) withMatch = R.fmap' (\(WM t x) -> (t,x)) . go where go :: REText b -> REText (WithMatch b) go re = case re of RToken t -> tokenWithMatch t RFmap st f re1 -> let g = case st of Strict -> fmapWM' f NonStrict -> fmap f in RFmap Strict g (go re1) RFmap_ b re1 -> RFmap Strict (flip WM b) (toMatch re1) RPure b -> RPure (pure b) RLiftA2 st f re1 re2 -> let g = case st of Strict -> liftA2WM' f NonStrict -> liftA2 f in RLiftA2 Strict g (go re1) (go re2) REmpty -> REmpty RAlt re1 re2 -> RAlt (go re1) (go re2) RMany f1 f2 f z re1 -> RMany (fmapWM' f1) (fmapWM' f2) (liftA2WM' f) (pure z) (go re1) RFold st gr f z re1 -> let g = case st of Strict -> liftA2WM' f NonStrict -> liftA2 f in RFold Strict gr g (pure z) (go re1) ---------- -- Parse ---------- tokenFoldr :: (TextToken -> b -> b) -> b -> Text -> b tokenFoldr f z (TInternal.Text a o0 l) = loop o0 where loop o | o - o0 >= l = z loop o = case TUnsafe.iterArray a o of TUnsafe.Iter c clen -> f (TextToken a o c) (loop (o + clen)) {-# INLINE tokenFoldr #-} -- | \(O(mn \log m)\). Parse a @Text@ with a @REText@. -- -- Uses 'Regex.Text.compile', see the note there. -- -- If parsing multiple @Text@s using the same @RE@, it is wasteful to compile -- the @RE@ every time. So, prefer to -- -- * Compile once with 'Regex.Text.compile' or 'Regex.Text.compileBounded' and -- use the compiled 'ParserText' with 'parse' as many times as required. -- * Alternately, partially apply this function to a @RE@ and use the function -- as many times as required. reParse :: REText a -> Text -> Maybe a reParse re = let !p = P.compile re in parse p {-# INLINE reParse #-} -- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@. parse :: ParserText a -> Text -> Maybe a parse = P.parseFoldr tokenFoldr -- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@. Calls 'error' on -- parse failure. -- -- For use with parsers that are known to never fail. parseSure :: ParserText a -> Text -> a parseSure p = fromMaybe parseSureError . parse p parseSureError :: a parseSureError = errorWithoutStackTrace "Regex.Text.parseSure: parse failed; if parsing can fail use 'parse' instead" reParseSure :: REText a -> Text -> a reParseSure re = fromMaybe parseSureError . reParse re {-# INLINE reParseSure #-} -- | \(O(mn \log m)\). Find the first occurence of the given @RE@ in a @Text@. -- -- ==== __Examples__ -- -- >>> find (text "meow") "homeowner" -- Just "meow" -- -- To test whether a @Text@ is present in another @Text@, like above, prefer -- @Data.Text.'T.isInfixOf'@. -- -- >>> find (textIgnoreCase "haskell") "Look I'm Haskelling!" -- Just "Haskell" -- >>> find (text "backtracking") "parser-regex" -- Nothing -- find :: REText a -> Text -> Maybe a find = reParse . R.toFind {-# INLINE find #-} -- | \(O(mn \log m)\). Find all non-overlapping occurences of the given @RE@ in -- the @Text@. -- -- ==== __Examples__ -- -- >>> findAll (text "ana") "banananana" -- ["ana","ana"] -- -- @ -- data Roll = Roll -- Natural -- ^ Rolls -- Natural -- ^ Faces on the die -- deriving Show -- -- roll :: REText Roll -- roll = Roll \<$> ('naturalDec' \<|> pure 1) \<* 'char' \'d\' \<*> naturalDec -- @ -- -- >>> findAll roll "3d6, d10, 2d10" -- [Roll 3 6,Roll 1 10,Roll 2 10] -- findAll :: REText a -> Text -> [a] findAll = reParseSure . R.toFindMany {-# INLINE findAll #-} -- | \(O(mn \log m)\). Split a @Text@ at occurences of the given @RE@. -- -- ==== __Examples__ -- -- >>> splitOn (char ' ') "Glasses are really versatile" -- ["Glasses","are","really","versatile"] -- -- For simple splitting, like above, prefer @Data.Text.'Data.Text.words'@, -- @Data.Text.'Data.Text.lines'@, @Data.Text.'Data.Text.split'@ or -- @Data.Text.'Data.Text.splitOn'@, whichever is applicable. -- -- >>> splitOn (char ' ' *> oneOf "+-=" *> char ' ') "3 - 1 + 1/2 - 2 = 0" -- ["3","1","1/2","2","0"] -- -- If the @Text@ starts or ends with a delimiter, the result will contain -- empty @Text@s at those positions. -- -- >>> splitOn (char 'a') "ayaya" -- ["","y","y",""] -- splitOn :: REText a -> Text -> [Text] splitOn = reParseSure . toSplitOn {-# INLINE splitOn #-} toSplitOn :: REText a -> REText [Text] toSplitOn re = manyTextMin `R.sepBy` re -- | \(O(mn \log m)\). Replace the first match of the given @RE@ with its -- result. If there is no match, the result is @Nothing@. -- -- ==== __Examples__ -- -- >>> replace ("world" <$ text "Haskell") "Hello, Haskell!" -- Just "Hello, world!" -- -- >>> replace ("," <$ some (char '.')) "one...two...ten" -- Just "one,two...ten" -- replace :: REText Text -> Text -> Maybe Text replace = reParse . toReplace {-# INLINE replace #-} toReplace :: REText Text -> REText Text toReplace re = liftA2 f manyTextMin re <*> manyText where f a b c = reverseConcat [c,b,a] -- | \(O(mn \log m)\). Replace non-overlapping matches of the given @RE@ with -- their results. -- -- ==== __Examples__ -- -- >>> replaceAll (" and " <$ text ", ") "red, blue, green" -- "red and blue and green" -- -- For simple replacements like above, prefer @Data.Text.'Data.Text.replace'@. -- -- >>> replaceAll ("Fruit" <$ text "Time" <|> "banana" <$ text "arrow") "Time flies like an arrow" -- "Fruit flies like a banana" -- -- @ -- sep = 'oneOf' "-./" -- digits n = 'toMatch' ('replicateM_' n (oneOf 'Data.CharSet.digit')) -- toYmd d m y = mconcat [y, \"-\", m, \"-\", d] -- date = toYmd \<$> digits 2 \<* sep -- \<*> digits 2 \<* sep -- \<*> digits 4 -- @ -- >>> replaceAll date "01/01/1970, 01-04-1990, 03.07.2011" -- "1970-01-01, 1990-04-01, 2011-07-03" -- replaceAll :: REText Text -> Text -> Text replaceAll = reParseSure . toReplaceMany {-# INLINE replaceAll #-} toReplaceMany :: REText Text -> REText Text toReplaceMany re = reverseConcat <$> R.foldlMany' (flip (:)) [] (re <|> anyTokenMatch) ------------------------- -- Low level Text stuff ------------------------- -- WARNING: If t1 and t2 are not empty, they must be adjacent slices of the -- same Text. In other words, sameByteArray# a1 _a2 && o1 + l1 == _o2. adjacentAppend :: Text -> Text -> Text adjacentAppend t1@(TInternal.Text a1 o1 l1) t2@(TInternal.Text _a2 _o2 l2) | T.null t1 = t2 | T.null t2 = t1 | otherwise = TInternal.Text a1 o1 (l1+l2) -- reverseConcat = T.concat . reverse reverseConcat :: [Text] -> Text reverseConcat ts = case ts of [] -> T.empty [t] -> t _ | len == 0 -> T.empty | otherwise -> TInternal.Text arr 0 len where flen acc (TInternal.Text _ _ l) | acc' >= 0 = acc' | otherwise = reverseConcatOverflowError where acc' = acc + l len = foldl' flen 0 ts arr = TArray.run $ do marr <- TArray.new len let loop !_ [] = pure marr loop i (TInternal.Text a o l : ts') = TArray.copyI l marr (i-l) a o *> loop (i-l) ts' loop len ts reverseConcatOverflowError :: a reverseConcatOverflowError = errorWithoutStackTrace "Regex.Text.reverseConcat: size overflow" ---------- -- Notes ---------- -- Note [Why simple case fold] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- Unicode defines two different ways to case fold, "simple" and "full". Full is -- superior to simple, and capable of folding more pairs of texts to the same -- text. This is what is used by Data.Text.toCaseFold. -- -- However, full maps a Char to one or more Chars, for instance "ß" maps to -- "ss". Since we operate on one Char at a time without backtracking, we must -- have branching in our regex corresponding to possible texts that case fold to -- a target text. For instance, to match "sssss" with full case fold given the -- above mapping, possible inputs are -- -- sssss, sssß, ssßs, sßss, ßsss, sßß, ßsß, ßßs -- -- Fun fact: the number of strings that match "s"*n is Fibonacci(n+1). -- Of course, we can't have textIgnoreCase take a text and explode into a regex -- of exponential size. -- -- So, we restrict ourselves to simple case folding. Simple case folding -- maps a single Char to a single Char. And it's easy to test that the required -- Char and a Char in the input case fold to the same Char. -- -- Note that charIgnoreCase could possibly use full case folding. Only a small -- number of texts would case fold to the case fold of a single Char. But we -- stick with simple case fold for consistency.