\begin{code}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE MonoLocalBinds #-}
\end{code}
\begin{code}
module Text.RE.ZeInternals.Types.Match
( Match(..)
, noMatch
, emptyMatchArray
, matched
, matchedText
, matchCapture
, matchCaptures
, (!$$)
, captureText
, (!$$?)
, captureTextMaybe
, (!$)
, capture
, (!$?)
, captureMaybe
, RegexFix(..)
, convertMatchText
) where
\end{code}
\begin{code}
import Data.Array
import Data.Bits
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Maybe
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as LT
import Data.Typeable
import Data.Word
import Text.RE.ZeInternals.Types.Capture
import Text.RE.ZeInternals.Types.CaptureID
import Text.Regex.Base
import qualified Text.Regex.PCRE as PCRE
import qualified Text.Regex.TDFA as TDFA
infixl 9 !$, !$$
\end{code}
\begin{code}
data Match a =
Match
{ matchSource :: !a
, captureNames :: !CaptureNames
, matchArray :: !(Array CaptureOrdinal (Capture a))
}
deriving (Show,Eq,Typeable)
\end{code}
\begin{code}
noMatch :: a -> Match a
noMatch t = Match t noCaptureNames emptyMatchArray
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (CaptureOrdinal 0,CaptureOrdinal $ -1) []
\end{code}
\begin{code}
instance Functor Match where
fmap f Match{..} =
Match
{ matchSource = f matchSource
, captureNames = captureNames
, matchArray = fmap (fmap f) matchArray
}
\end{code}
\begin{code}
matched :: Match a -> Bool
matched = isJust . matchCapture
matchedText :: Match a -> Maybe a
matchedText = fmap capturedText . matchCapture
matchCapture :: Match a -> Maybe (Capture a)
matchCapture = fmap fst . matchCaptures
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{..} = case rangeSize (bounds matchArray) == 0 of
True -> Nothing
False -> Just (matchArray!0,drop 1 $ elems matchArray)
(!$$) :: Match a -> CaptureID -> a
(!$$) = flip captureText
captureText :: CaptureID -> Match a -> a
captureText cid mtch = capturedText $ capture cid mtch
(!$$?) :: Match a -> CaptureID -> Maybe a
(!$$?) = flip captureTextMaybe
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch = do
cap <- mtch !$? cid
case hasCaptured cap of
True -> Just $ capturedText cap
False -> Nothing
(!$) :: Match a -> CaptureID -> Capture a
(!$) = flip capture
capture :: CaptureID -> Match a -> Capture a
capture cid mtch = fromMaybe oops $ mtch !$? cid
where
oops = error $ "capture: out of bounds (" ++ show cid ++ ")"
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
(!$?) = flip captureMaybe
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{..} = do
i <- lookupCaptureID cid mtch
cap <- case bounds matchArray `inRange` i of
True -> Just $ matchArray ! i
False -> Nothing
case hasCaptured cap of
True -> Just cap
False -> Nothing
lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid Match{..} =
either (const Nothing) Just $ findCaptureID cid captureNames
\end{code}
\begin{code}
instance
( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
, RegexLike regex source
, RegexFix regex source
) =>
RegexContext regex source (Match source) where
match r s = convertMatchText r s $ getAllTextSubmatches $ match r s
matchM r s = do
y <- matchM r s
return $ convertMatchText r s $ getAllTextSubmatches y
\end{code}
\begin{code}
convertMatchText :: RegexFix regex source
=> regex
-> source
-> MatchText source
-> Match source
convertMatchText re hay arr =
Match
{ matchSource = hay
, captureNames = noCaptureNames
, matchArray =
ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal $
fmap f arr
}
where
(lo,hi) = bounds arr
f (ndl,(off_,len_)) =
Capture
{ captureSource = hay
, capturedText = ndl
, captureOffset = off
, captureLength = len
}
where
CharRange off len = utf8_correct re hay off_ len_
\end{code}
\begin{code}
data CharRange = CharRange !Int !Int
deriving (Show)
class RegexFix regex source where
utf8_correct :: regex -> source -> Int -> Int -> CharRange
utf8_correct _ _ = CharRange
instance RegexFix TDFA.Regex [Char] where
instance RegexFix TDFA.Regex B.ByteString where
instance RegexFix TDFA.Regex LBS.ByteString where
instance RegexFix TDFA.Regex T.Text where
instance RegexFix TDFA.Regex LT.Text where
instance RegexFix TDFA.Regex (S.Seq Char) where
instance RegexFix PCRE.Regex [Char] where
utf8_correct _ = utf8_correct_bs . B.fromString
instance RegexFix PCRE.Regex B.ByteString where
instance RegexFix PCRE.Regex LBS.ByteString where
instance RegexFix PCRE.Regex T.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8
instance RegexFix PCRE.Regex LT.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8 . LT.toStrict
instance RegexFix PCRE.Regex (S.Seq Char) where
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0 = case ix0+ln0 > BW.length bs of
True -> error "utf8_correct_bs: index+length out of range"
False -> skip 0 0
where
skip ix di = case compare ix ix0 of
GT -> error "utf8_correct_bs: UTF-8 decoding error"
EQ -> count ix di 0 ln0
LT -> case u8_width $ BW.index bs ix of
Single -> skip (ix+1) di
Double -> skip (ix+2) $ di+1
Triple -> skip (ix+3) $ di+2
Quadruple -> skip (ix+4) $ di+3
count ix di dl c = case compare c 0 of
LT -> error "utf8_correct_bs: length ends inside character"
EQ -> CharRange (ix0-di) (ln0-dl)
GT -> case u8_width $ BW.index bs ix of
Single -> count (ix+1) di dl $ c-1
Double -> count (ix+2) di (dl+1) $ c-2
Triple -> count (ix+3) di (dl+2) $ c-3
Quadruple -> count (ix+4) di (dl+3) $ c-4
data UTF8Size = Single | Double | Triple | Quadruple
deriving (Show)
u8_width :: Word8 -> UTF8Size
u8_width w8 = case w8 .&. 0x80 == 0x00 of
True -> Single
False -> case w8 .&. 0xE0 == 0xC0 of
True -> Double
False -> case w8 .&. 0xF0 == 0xE0 of
True -> Triple
False -> case w8 .&. 0xF8 == 0xF0 of
True -> Quadruple
False -> error "u8_width: UTF-8 decoding error"
\end{code}