{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}

-- |
-- Module      : Data.Text.Lazy.Search
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh.  Adapted from the strict
-- implementation.

module Data.Text.Internal.Lazy.Search
    (
      indices
    ) where

import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word16, Word64)
import qualified Data.Text.Internal as T
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldlChunks)
import Data.Bits ((.|.), (.&.))
import Data.Text.Internal.Unsafe.Shift (shiftL)

-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text              -- ^ Substring to search for (@needle@)
        -> Text              -- ^ Text to search in (@haystack@)
        -> [Int64]
indices :: Text -> Text -> [Int64]
indices needle :: Text
needle@(Chunk Text
n Text
ns) _haystack :: Text
_haystack@(Chunk Text
k Text
ks)
    | Int64
nlen Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
<= Int64
0  = []
    | Int64
nlen Int64 -> Int64 -> Bool
forall a. Eq a => a -> a -> Bool
== Int64
1  = Word16 -> Int64 -> Text -> Text -> [Int64]
indicesOne (Int64 -> Word16
nindex Int64
0) Int64
0 Text
k Text
ks
    | Bool
otherwise  = Text -> Text -> Int64 -> Int64 -> [Int64]
advance Text
k Text
ks Int64
0 Int64
0
  where
    advance :: Text -> Text -> Int64 -> Int64 -> [Int64]
advance x :: Text
x@(T.Text Array
_ Int
_ Int
l) Text
xs = Int64 -> Int64 -> [Int64]
scan
     where
      scan :: Int64 -> Int64 -> [Int64]
scan !Int64
g !Int64
i
         | Int64
i Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int64
m = case Text
xs of
                      Text
Empty           -> []
                      Chunk y ys      -> Text -> Text -> Int64 -> Int64 -> [Int64]
advance Text
y Text
ys Int64
g (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-Int64
m)
         | Int64 -> Text -> Text -> Bool
lackingHay (Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
nlen) Text
x Text
xs  = []
         | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z Bool -> Bool -> Bool
&& Int64 -> Bool
candidateMatch Int64
0  = Int64
g Int64 -> [Int64] -> [Int64]
forall a. a -> [a] -> [a]
: Int64 -> Int64 -> [Int64]
scan (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen) (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen)
         | Bool
otherwise                   = Int64 -> Int64 -> [Int64]
scan (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
delta) (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
delta)
       where
         m :: Int64
m = Int -> Int64
intToInt64 Int
l
         c :: Word16
c = Int64 -> Word16
hindex (Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
nlast)
         delta :: Int64
delta | Bool
nextInPattern = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
1
               | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z        = Int64
skip Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
1
               | Bool
otherwise     = Int64
1
         nextInPattern :: Bool
nextInPattern         = Word64
mask Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.&. Word16 -> Word64
swizzle (Int64 -> Word16
hindex (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen)) Word64 -> Word64 -> Bool
forall a. Eq a => a -> a -> Bool
== Word64
0
         candidateMatch :: Int64 -> Bool
candidateMatch !Int64
j
             | Int64
j Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int64
nlast               = Bool
True
             | Int64 -> Word16
hindex (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
j) Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
/= Int64 -> Word16
nindex Int64
j = Bool
False
             | Bool
otherwise                = Int64 -> Bool
candidateMatch (Int64
jInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
1)
         hindex :: Int64 -> Word16
hindex                         = Text -> Text -> Int64 -> Word16
index Text
x Text
xs
    nlen :: Int64
nlen      = Text -> Int64
wordLength Text
needle
    nlast :: Int64
nlast     = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
1
    nindex :: Int64 -> Word16
nindex    = Text -> Text -> Int64 -> Word16
index Text
n Text
ns
    z :: Word16
z         = (Word16 -> Text -> Word16) -> Word16 -> Text -> Word16
forall a. (a -> Text -> a) -> a -> Text -> a
foldlChunks Word16 -> Text -> Word16
forall p. p -> Text -> Word16
fin Word16
0 Text
needle
        where fin :: p -> Text -> Word16
fin p
_ (T.Text Array
farr Int
foff Int
flen) = Array -> Int -> Word16
A.unsafeIndex Array
farr (Int
foffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
flenInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1)
    (Word64
mask :: Word64) :*: Int64
skip = Text
-> Text -> Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
buildTable Text
n Text
ns Int64
0 Int
0 Word64
0 (Int64
nlenInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-Int64
2)

    swizzle :: Word16 -> Word64
    swizzle :: Word16 -> Word64
swizzle Word16
w = Word64
1 Word64 -> Int -> Word64
forall a. UnsafeShift a => a -> Int -> a
`shiftL` (Word16 -> Int
word16ToInt Word16
w Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)

    buildTable :: Text
-> Text -> Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
buildTable (T.Text Array
xarr Int
xoff Int
xlen) Text
xs = Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
go
      where
        go :: Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
go !(Int64
g::Int64) !Int
i !Word64
msk !Int64
skp
            | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
xlast = case Text
xs of
                             Text
Empty      -> (Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word16 -> Word64
swizzle Word16
z) Word64 -> Int64 -> PairS Word64 Int64
forall a b. a -> b -> PairS a b
:*: Int64
skp
                             Chunk y ys -> Text
-> Text -> Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
buildTable Text
y Text
ys Int64
g Int
0 Word64
msk' Int64
skp'
            | Bool
otherwise = Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
go (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
1) (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1) Word64
msk' Int64
skp'
            where c :: Word16
c                = Array -> Int -> Word16
A.unsafeIndex Array
xarr (Int
xoffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
i)
                  msk' :: Word64
msk'             = Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word16 -> Word64
swizzle Word16
c
                  skp' :: Int64
skp' | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z    = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
g Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
2
                       | Bool
otherwise = Int64
skp
                  xlast :: Int
xlast = Int
xlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1

    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay :: Int64 -> T.Text -> Text -> Bool
    lackingHay :: Int64 -> Text -> Text -> Bool
lackingHay Int64
q = Int64 -> Text -> Text -> Bool
go Int64
0
      where
        go :: Int64 -> Text -> Text -> Bool
go Int64
p (T.Text Array
_ Int
_ Int
l) Text
ps = Int64
p' Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
< Int64
q Bool -> Bool -> Bool
&& case Text
ps of
                                             Text
Empty      -> Bool
True
                                             Chunk Text
r Text
rs -> Int64 -> Text -> Text -> Bool
go Int64
p' Text
r Text
rs
            where p' :: Int64
p' = Int64
p Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
l
indices Text
_ Text
_ = []

-- | Fast index into a partly unpacked 'Text'.  We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word16
index :: Text -> Text -> Int64 -> Word16
index (T.Text Array
arr Int
off Int
len) Text
xs !Int64
i
    | Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len   = Array -> Int -> Word16
A.unsafeIndex Array
arr (Int
offInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
j)
    | Bool
otherwise = case Text
xs of
                    Text
Empty
                        -- out of bounds, but legal
                        | Int
j Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len  -> Word16
0
                        -- should never happen, due to lackingHay above
                        | Bool
otherwise -> String -> Word16
forall a. String -> a
emptyError String
"index"
                    Chunk Text
c Text
cs -> Text -> Text -> Int64 -> Word16
index Text
c Text
cs (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-Int -> Int64
intToInt64 Int
len)
    where j :: Int
j = Int64 -> Int
int64ToInt Int64
i

-- | A variant of 'indices' that scans linearly for a single 'Word16'.
indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne :: Word16 -> Int64 -> Text -> Text -> [Int64]
indicesOne Word16
c = Int64 -> Text -> Text -> [Int64]
chunk
  where
    chunk :: Int64 -> T.Text -> Text -> [Int64]
    chunk :: Int64 -> Text -> Text -> [Int64]
chunk !Int64
i (T.Text Array
oarr Int
ooff Int
olen) Text
os = Int -> [Int64]
go Int
0
      where
        go :: Int -> [Int64]
go Int
h | Int
h Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
olen = case Text
os of
                             Text
Empty      -> []
                             Chunk Text
y Text
ys -> Int64 -> Text -> Text -> [Int64]
chunk (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int -> Int64
intToInt64 Int
olen) Text
y Text
ys
             | Word16
on Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
c = Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
h Int64 -> [Int64] -> [Int64]
forall a. a -> [a] -> [a]
: Int -> [Int64]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
             | Bool
otherwise = Int -> [Int64]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
             where on :: Word16
on = Array -> Int -> Word16
A.unsafeIndex Array
oarr (Int
ooffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
h)

-- | The number of 'Word16' values in a 'Text'.
wordLength :: Text -> Int64
wordLength :: Text -> Int64
wordLength = (Int64 -> Text -> Int64) -> Int64 -> Text -> Int64
forall a. (a -> Text -> a) -> a -> Text -> a
foldlChunks Int64 -> Text -> Int64
sumLength Int64
0
  where
    sumLength :: Int64 -> T.Text -> Int64
    sumLength :: Int64 -> Text -> Int64
sumLength Int64
i (T.Text Array
_ Int
_ Int
l) = Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int -> Int64
intToInt64 Int
l

emptyError :: String -> a
emptyError :: String -> a
emptyError String
fun = String -> a
forall a. HasCallStack => String -> a
error (String
"Data.Text.Lazy.Search." String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
fun String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
": empty input")

intToInt64 :: Int -> Int64
intToInt64 :: Int -> Int64
intToInt64 = Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral

int64ToInt :: Int64 -> Int
int64ToInt :: Int64 -> Int
int64ToInt = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

word16ToInt :: Word16 -> Int
word16ToInt :: Word16 -> Int
word16ToInt = Word16 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral