module Data.ByteString.Lazy.Search.KarpRabin (
indicesOfAny
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Unsafe (unsafeIndex)
import qualified Data.IntMap as IM
import Data.Array
import Data.Array.Base (unsafeAt)
import Data.Word (Word8)
import Data.Int (Int64)
import Data.Bits
import Data.List (foldl')
indicesOfAny :: [S.ByteString]
-> L.ByteString
-> [(Int64,[Int])]
indicesOfAny pats
| null nepats = const []
| otherwise = lazyMatcher nepats . L.toChunks
where
nepats = filter (not . S.null) pats
rehash1 :: Int -> Int -> Word8 -> Word8 -> Int
rehash1 out h o n =
(h `shiftL` 1 (fromIntegral o `shiftL` out)) + fromIntegral n
rehash2 :: Int -> Int -> Word8 -> Word8 -> Int
rehash2 out h o n =
(h `shiftL` 2 (fromIntegral o `shiftL` out)) + fromIntegral n
rehash3 :: Int -> Int -> Word8 -> Word8 -> Int
rehash3 out h o n =
(h `shiftL` 3 (fromIntegral o `shiftL` out)) + fromIntegral n
rehash4 :: Int -> Int -> Word8 -> Word8 -> Int
rehash4 out h o n =
(h `shiftL` 4 (fromIntegral o `shiftL` out)) + fromIntegral n
lazyMatcher :: [S.ByteString] -> [S.ByteString] -> [(Int64,[Int])]
lazyMatcher pats = search 0 hLen S.empty
where
!hLen = minimum (32 : map S.length pats)
!shDi = case 32 `quot` hLen of
q | q < 4 -> q
| otherwise -> 4
!outS = shDi*hLen
!patNum = length pats
!patArr = listArray (0, patNum 1) pats
rehash :: Int -> Word8 -> Word8 -> Int
rehash = case shDi of
1 -> rehash1 hLen
2 -> rehash2 outS
3 -> rehash3 outS
_ -> rehash4 outS
hash :: S.ByteString -> Int
hash = S.foldl' (\h w -> (h `shiftL` shDi) + fromIntegral w) 0 . S.take hLen
!hashMap =
foldl' (\mp (h,i) -> IM.insertWith (flip (++)) h [i] mp) IM.empty $
zip (map hash pats) [0 :: Int .. ]
search _ _ _ [] = []
search !h !rm !prev (!str : rest)
| strLen < rm =
let !h' = S.foldl' (\o w -> (o `shiftL` 1) + fromIntegral w) h str
!prev' = S.append prev str
in search h' (rm strLen) prev' rest
| otherwise =
let !h' = S.foldl' (\o w -> (o `shiftL` 1) + fromIntegral w) h
(S.take rm str)
in if S.null prev
then noPast 0 rest str h'
else past 0 rest prev 0 str rm h'
where
!strLen = S.length str
noPast !prior rest !str hsh = go hsh 0
where
!strLen = S.length str
!maxIdx = strLen hLen
strAt !i = unsafeIndex str i
go !h sI =
case IM.lookup h hashMap of
Nothing ->
if sI == maxIdx
then case rest of
[] -> []
(nxt : more) ->
let !h' = rehash h (strAt sI) (unsafeIndex nxt 0)
!prior' = prior + fromIntegral strLen
!prev = S.drop (sI + 1) str
in if hLen == 1
then noPast prior' more nxt h'
else past prior' more prev 0 nxt 1 h'
else go (rehash h (strAt sI) (strAt (sI + hLen))) (sI + 1)
Just ps ->
let !rst = S.drop sI str
!rLen = strLen sI
hd = strAt sI
more =
if sI == maxIdx
then case rest of
[] -> []
(nxt : fut) ->
let !h' = rehash h hd (unsafeIndex nxt 0)
!prior' = prior + fromIntegral strLen
in if hLen == 1
then noPast prior' fut nxt h'
else past prior' fut rst 1 nxt 1 h'
else go (rehash h hd (strAt (sI + hLen))) (sI + 1)
okay bs
| rLen < S.length bs = S.isPrefixOf rst bs &&
checkFut (S.drop rLen bs) rest
| otherwise = S.isPrefixOf bs rst
in case filter (okay . (patArr `unsafeAt`)) ps of
[] -> more
qs -> seq (length qs) $
(prior + fromIntegral sI,qs) : more
past !prior rest !prev !pI !str !sI !hsh
| strLen < 4040 =
let !prior' = prior 1 + fromIntegral (sI hLen)
!curr = S.append (S.drop pI prev) str
in noPast prior' rest curr hsh
| otherwise = go hsh pI sI
where
!strLen = S.length str
strAt !i = unsafeIndex str i
prevAt !i = unsafeIndex prev i
go !h !p !s
| s == hLen = noPast prior rest str h
| otherwise =
case IM.lookup h hashMap of
Nothing ->
let
h' = rehash h (prevAt p) (strAt s)
in go h' (p + 1) (s + 1)
Just ps ->
let !prst = S.drop p prev
more = go (rehash h (prevAt p) (strAt s)) (p + 1) (s + 1)
okay bs = checkFut bs (prst : str : rest)
in case filter (okay . (unsafeAt patArr)) ps of
[] -> more
qs -> seq (length qs) $
(prior + fromIntegral (s hLen), qs) : more
checkFut :: S.ByteString -> [S.ByteString] -> Bool
checkFut _ [] = False
checkFut !bs (!h : t)
| hLen < S.length bs = S.isPrefixOf h bs && checkFut (S.drop hLen bs) t
| otherwise = S.isPrefixOf bs h
where
!hLen = S.length h