{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE MagicHash                #-}

module Data.HashTable.Internal.CacheLine
  ( cacheLineSearch
  , cacheLineSearch2
  , cacheLineSearch3
  , forwardSearch2
  , forwardSearch3
  , isCacheLineAligned
  , advanceByCacheLineSize
  , prefetchRead
  , prefetchWrite
  , bl_abs#
  , sign#
  , mask#
  , mask
  , maskw#
  ) where

import           Control.Monad
import           Control.Monad.ST                 (ST)

import           Data.HashTable.Internal.IntArray (Elem, IntArray)
import qualified Data.HashTable.Internal.IntArray as M

#ifndef NO_C_SEARCH
import           Foreign.C.Types
#else
import           Data.Bits
import           Data.Int
import qualified Data.Vector.Unboxed              as U
import           GHC.Int
#endif

import           Data.HashTable.Internal.Utils
import           GHC.Exts

#if __GLASGOW_HASKELL__ >= 707
import           GHC.Exts                         (isTrue#)
#else
isTrue# :: Bool -> Bool
isTrue# = id
#endif

{-# INLINE prefetchRead  #-}
{-# INLINE prefetchWrite #-}
prefetchRead :: IntArray s -> Int -> ST s ()
prefetchWrite :: IntArray s -> Int -> ST s ()

#ifndef NO_C_SEARCH
foreign import ccall unsafe "line_search"
  c_lineSearch :: Ptr a -> CInt -> CUShort -> IO CInt

foreign import ccall unsafe "line_search_2"
  c_lineSearch_2 :: Ptr a -> CInt -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "line_search_3"
  c_lineSearch_3 :: Ptr a -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "forward_search_2"
  c_forwardSearch_2 :: Ptr a -> CInt -> CInt -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "forward_search_3"
  c_forwardSearch_3 :: Ptr a -> CInt -> CInt -> CUShort -> CUShort -> CUShort
                    -> IO CInt

foreign import ccall unsafe "prefetch_cacheline_read"
  prefetchCacheLine_read :: Ptr a -> CInt -> IO ()

foreign import ccall unsafe "prefetch_cacheline_write"
  prefetchCacheLine_write :: Ptr a -> CInt -> IO ()


fI :: (Num b, Integral a) => a -> b
fI = fromIntegral

prefetchRead a i = unsafeIOToST $ prefetchCacheLine_read v x
  where
    v   = M.toPtr a
    x   = fI i

prefetchWrite a i = unsafeIOToST $ prefetchCacheLine_write v x
  where
    v   = M.toPtr a
    x   = fI i


{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Elem -> Elem -> ST s Int
forwardSearch2 !vec !start !end !x1 !x2 =
    liftM fromEnum $! unsafeIOToST c
  where
    c = c_forwardSearch_2 (M.toPtr vec) (fI start) (fI end) (fI x1) (fI x2)


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Elem -> Elem -> Elem -> ST s Int
forwardSearch3 !vec !start !end !x1 !x2 !x3 =
    liftM fromEnum $! unsafeIOToST c
  where
    c = c_forwardSearch_3 (M.toPtr vec) (fI start) (fI end)
                          (fI x1) (fI x2) (fI x3)


{-# INLINE lineSearch #-}
lineSearch :: IntArray s -> Int -> Elem -> ST s Int
lineSearch !vec !start !value =
    liftM fromEnum $! unsafeIOToST c
  where
    c = c_lineSearch (M.toPtr vec) (fI start) (fI value)


{-# INLINE lineSearch2 #-}
lineSearch2 :: IntArray s -> Int -> Elem -> Elem -> ST s Int
lineSearch2 !vec !start !x1 !x2 =
    liftM fromEnum $! unsafeIOToST c
  where
    c = c_lineSearch_2 (M.toPtr vec) (fI start) (fI x1) (fI x2)


{-# INLINE lineSearch3 #-}
lineSearch3 :: IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
lineSearch3 !vec !start !x1 !x2 !x3 =
    liftM fromEnum $! unsafeIOToST c
  where
    c = c_lineSearch_3 (M.toPtr vec) (fI start) (fI x1) (fI x2) (fI x3)
#endif

{-# INLINE advanceByCacheLineSize #-}
advanceByCacheLineSize :: Int -> Int -> Int
advanceByCacheLineSize !(I# start0#) !(I# vecSize#) = out
  where
    !(I# clm#) = cacheLineIntMask
    !clmm#     = not# (int2Word# clm#)
    !start#    = word2Int# (clmm# `and#` int2Word# start0#)
    !(I# nw#)  = numElemsInCacheLine
    !start'#   = start# +# nw#
    !s#        = sign# (vecSize# -# start'# -# 1#)
    !m#        = not# (int2Word# s#)
    !r#        = int2Word# start'# `and#` m#
    !out       = I# (word2Int# r#)


{-# INLINE isCacheLineAligned #-}
isCacheLineAligned :: Int -> Bool
isCacheLineAligned (I# x#) = isTrue# (r# ==# 0#)
  where
    !(I# m#) = cacheLineIntMask
    !mw#     = int2Word# m#
    !w#      = int2Word# x#
    !r#      = word2Int# (mw# `and#` w#)


{-# INLINE sign# #-}
-- | Returns 0 if x is positive, -1 otherwise
sign# :: Int# -> Int#
sign# !x# = x# `uncheckedIShiftRA#` wordSizeMinus1#
  where
    !(I# wordSizeMinus1#) = wordSize-1


{-# INLINE bl_abs# #-}
-- | Abs of an integer, branchless
bl_abs# :: Int# -> Int#
bl_abs# !x# = word2Int# r#
  where
    !m# = sign# x#
    !r# = (int2Word# (m# +# x#)) `xor#` int2Word# m#


{-# INLINE mask# #-}
-- | Returns 0xfff..fff (aka -1) if a# == b#, 0 otherwise.
mask# :: Int# -> Int# -> Int#
mask# !a# !b# = dest#
  where
    !d#    = a# -# b#
    !r#    = bl_abs# d# -# 1#
    !dest# = sign# r#


{- note: this code should be:

mask# :: Int# -> Int# -> Int#
mask# !a# !b# = let !(I# z#) = fromEnum (a# ==# b#)
                    !q#      = negateInt# z#
                in q#

but GHC doesn't properly optimize this as straight-line code at the moment.

-}


{-# INLINE mask #-}
-- | Returns 0xfff..fff (aka -1) if a == b, 0 otherwise.
mask :: Int -> Int -> Int
mask (I# a#) (I# b#) = I# (mask# a# b#)


{-# INLINE maskw# #-}
maskw# :: Int# -> Int# -> Word#
maskw# !a# !b# = int2Word# (mask# a# b#)


#ifdef NO_C_SEARCH
prefetchRead _ _ = return ()
prefetchWrite _ _ = return ()

{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Elem -> Elem -> ST s Int
forwardSearch2 !vec !start !end !x1 !x2 = go start end False
  where
    go !i !e !b =
      if i >= e then
        if b then return (-1) else go 0 start True
      else do
        h <- M.readArray vec i
        if h == x1 || h == x2
          then return i
          else go (i+1) e b


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Elem -> Elem -> Elem -> ST s Int
forwardSearch3 !vec !start !end !x1 !x2 !x3 = go start end False
  where
    go !i !e !b =
      if i >= e then
        if b then return (-1) else go 0 start True
      else do
        h <- M.readArray vec i
        if h == x1 || h == x2 || h == x3
          then return i
          else go (i+1) e b


deBruijnBitPositions :: U.Vector Int8
deBruijnBitPositions =
    U.fromList [
          0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
          31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
         ]


{-# INLINE firstBitSet# #-}
-- only works with 32-bit values -- ok for us here
firstBitSet# :: Int# -> Int#
firstBitSet# i# = word2Int# ((or# zeroCase# posw#))
  where
    !zeroCase#   = int2Word# (mask# 0# i#)
    !w#          = int2Word# i#
    !iLowest#    = word2Int# (and# w# (int2Word# (negateInt# i#)))
    !idxW#       = uncheckedShiftRL#
                       (narrow32Word# (timesWord# (int2Word# iLowest#)
                                                  (int2Word# 0x077CB531#)))
                       27#
    !idx         = I# (word2Int# idxW#)
    !(I8# pos8#) = U.unsafeIndex deBruijnBitPositions idx
    !posw#       = int2Word# pos8#

#endif


#ifdef NO_C_SEARCH
lineResult# :: Word#    -- ^ mask
            -> Int      -- ^ start value
            -> Int
lineResult# bitmask# (I# start#) = I# (word2Int# rv#)
  where
    !p#   = firstBitSet# (word2Int# bitmask#)
    !mm#  = maskw# p# (-1#)
    !nmm# = not# mm#
    !rv#  = mm# `or#` (nmm# `and#` (int2Word# (start# +# p#)))
{-# INLINE lineResult# #-}


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch :: IntArray s        -- ^ vector to search
           -> Int               -- ^ start index
           -> Elem              -- ^ value to search for
           -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch !vec !start !elem1 = do
    let !(I# v1#) = fromIntegral elem1

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (maskw# x1# v1#) (int2Word# 0x1#))

    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (maskw# x2# v1#) (int2Word# 0x2#))

    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (maskw# x3# v1#) (int2Word# 0x4#))

    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (maskw# x4# v1#) (int2Word# 0x8#))

    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (maskw# x5# v1#) (int2Word# 0x10#))

    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (maskw# x6# v1#) (int2Word# 0x20#))

    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (maskw# x7# v1#) (int2Word# 0x40#))

    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (maskw# x8# v1#) (int2Word# 0x80#))

    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (maskw# x9# v1#) (int2Word# 0x100#))

    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (maskw# x10# v1#) (int2Word# 0x200#))

    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (maskw# x11# v1#) (int2Word# 0x400#))

    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (maskw# x12# v1#) (int2Word# 0x800#))

    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (maskw# x13# v1#) (int2Word# 0x1000#))

    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (maskw# x14# v1#) (int2Word# 0x2000#))

    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (maskw# x15# v1#) (int2Word# 0x4000#))

    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (maskw# x16# v1#) (int2Word# 0x8000#))

    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (maskw# x17# v1#) (int2Word# 0x10000#))

    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (maskw# x18# v1#) (int2Word# 0x20000#))

    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (maskw# x19# v1#) (int2Word# 0x40000#))

    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (maskw# x20# v1#) (int2Word# 0x80000#))

    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (maskw# x21# v1#) (int2Word# 0x100000#))

    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (maskw# x22# v1#) (int2Word# 0x200000#))

    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (maskw# x23# v1#) (int2Word# 0x400000#))

    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (maskw# x24# v1#) (int2Word# 0x800000#))

    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (maskw# x25# v1#) (int2Word# 0x1000000#))

    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (maskw# x26# v1#) (int2Word# 0x2000000#))

    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (maskw# x27# v1#) (int2Word# 0x4000000#))

    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (maskw# x28# v1#) (int2Word# 0x8000000#))

    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (maskw# x29# v1#) (int2Word# 0x10000000#))

    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (maskw# x30# v1#) (int2Word# 0x20000000#))

    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (maskw# x31# v1#) (int2Word# 0x40000000#))

    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (maskw# x32# v1#) (int2Word# 0x80000000#))

    return $! lineResult# p32# start


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch2 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Elem              -- ^ value to search for
            -> Elem              -- ^ second value to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch2 !vec !start !elem1 !elem2 = do
    let !(I# v1#) = fromIntegral elem1
    let !(I# v2#) = fromIntegral elem2

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (int2Word# 0x1#)
                     (or# (maskw# x1# v1#) (maskw# x1# v2#)))
    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (int2Word# 0x2#)
                             (or# (maskw# x2# v1#) (maskw# x2# v2#)))
    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (int2Word# 0x4#)
                             (or# (maskw# x3# v1#) (maskw# x3# v2#)))
    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (int2Word# 0x8#)
                             (or# (maskw# x4# v1#) (maskw# x4# v2#)))
    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (int2Word# 0x10#)
                             (or# (maskw# x5# v1#) (maskw# x5# v2#)))
    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (int2Word# 0x20#)
                             (or# (maskw# x6# v1#) (maskw# x6# v2#)))
    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (int2Word# 0x40#)
                             (or# (maskw# x7# v1#) (maskw# x7# v2#)))
    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (int2Word# 0x80#)
                             (or# (maskw# x8# v1#) (maskw# x8# v2#)))
    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (int2Word# 0x100#)
                             (or# (maskw# x9# v1#) (maskw# x9# v2#)))
    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (int2Word# 0x200#)
                             (or# (maskw# x10# v1#) (maskw# x10# v2#)))
    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (int2Word# 0x400#)
                             (or# (maskw# x11# v1#) (maskw# x11# v2#)))
    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (int2Word# 0x800#)
                             (or# (maskw# x12# v1#) (maskw# x12# v2#)))
    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (int2Word# 0x1000#)
                             (or# (maskw# x13# v1#) (maskw# x13# v2#)))
    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (int2Word# 0x2000#)
                             (or# (maskw# x14# v1#) (maskw# x14# v2#)))
    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (int2Word# 0x4000#)
                             (or# (maskw# x15# v1#) (maskw# x15# v2#)))
    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (int2Word# 0x8000#)
                             (or# (maskw# x16# v1#) (maskw# x16# v2#)))
    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (int2Word# 0x10000#)
                             (or# (maskw# x17# v1#) (maskw# x17# v2#)))
    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (int2Word# 0x20000#)
                             (or# (maskw# x18# v1#) (maskw# x18# v2#)))
    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (int2Word# 0x40000#)
                             (or# (maskw# x19# v1#) (maskw# x19# v2#)))
    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (int2Word# 0x80000#)
                             (or# (maskw# x20# v1#) (maskw# x20# v2#)))
    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (int2Word# 0x100000#)
                             (or# (maskw# x21# v1#) (maskw# x21# v2#)))
    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (int2Word# 0x200000#)
                             (or# (maskw# x22# v1#) (maskw# x22# v2#)))
    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (int2Word# 0x400000#)
                             (or# (maskw# x23# v1#) (maskw# x23# v2#)))
    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (int2Word# 0x800000#)
                             (or# (maskw# x24# v1#) (maskw# x24# v2#)))
    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (int2Word# 0x1000000#)
                             (or# (maskw# x25# v1#) (maskw# x25# v2#)))
    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (int2Word# 0x2000000#)
                             (or# (maskw# x26# v1#) (maskw# x26# v2#)))
    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (int2Word# 0x4000000#)
                             (or# (maskw# x27# v1#) (maskw# x27# v2#)))
    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (int2Word# 0x8000000#)
                             (or# (maskw# x28# v1#) (maskw# x28# v2#)))
    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (int2Word# 0x10000000#)
                             (or# (maskw# x29# v1#) (maskw# x29# v2#)))
    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (int2Word# 0x20000000#)
                             (or# (maskw# x30# v1#) (maskw# x30# v2#)))
    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (int2Word# 0x40000000#)
                             (or# (maskw# x31# v1#) (maskw# x31# v2#)))
    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (int2Word# 0x80000000#)
                             (or# (maskw# x32# v1#) (maskw# x32# v2#)))

    return $! lineResult# p32# start


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch3 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Elem              -- ^ value to search for
            -> Elem              -- ^ second value to search for
            -> Elem              -- ^ third value to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch3 !vec !start !elem1 !elem2 !elem3 = do
    let !(I# v1#) = fromIntegral elem1
    let !(I# v2#) = fromIntegral elem2
    let !(I# v3#) = fromIntegral elem3

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (int2Word# 0x1#)
                     (or# (maskw# x1# v1#)
                          (or# (maskw# x1# v2#) (maskw# x1# v3#))))

    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (int2Word# 0x2#)
                             (or# (maskw# x2# v1#)
                                  (or# (maskw# x2# v2#) (maskw# x2# v3#))))

    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (int2Word# 0x4#)
                             (or# (maskw# x3# v1#)
                                  (or# (maskw# x3# v2#) (maskw# x3# v3#))))

    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (int2Word# 0x8#)
                             (or# (maskw# x4# v1#)
                                  (or# (maskw# x4# v2#) (maskw# x4# v3#))))

    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (int2Word# 0x10#)
                             (or# (maskw# x5# v1#)
                                  (or# (maskw# x5# v2#) (maskw# x5# v3#))))

    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (int2Word# 0x20#)
                             (or# (maskw# x6# v1#)
                                  (or# (maskw# x6# v2#) (maskw# x6# v3#))))

    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (int2Word# 0x40#)
                             (or# (maskw# x7# v1#)
                                  (or# (maskw# x7# v2#) (maskw# x7# v3#))))

    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (int2Word# 0x80#)
                             (or# (maskw# x8# v1#)
                                  (or# (maskw# x8# v2#) (maskw# x8# v3#))))

    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (int2Word# 0x100#)
                             (or# (maskw# x9# v1#)
                                  (or# (maskw# x9# v2#) (maskw# x9# v3#))))

    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (int2Word# 0x200#)
                             (or# (maskw# x10# v1#)
                                  (or# (maskw# x10# v2#) (maskw# x10# v3#))))

    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (int2Word# 0x400#)
                             (or# (maskw# x11# v1#)
                                  (or# (maskw# x11# v2#) (maskw# x11# v3#))))

    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (int2Word# 0x800#)
                             (or# (maskw# x12# v1#)
                                  (or# (maskw# x12# v2#) (maskw# x12# v3#))))

    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (int2Word# 0x1000#)
                             (or# (maskw# x13# v1#)
                                  (or# (maskw# x13# v2#) (maskw# x13# v3#))))

    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (int2Word# 0x2000#)
                             (or# (maskw# x14# v1#)
                                  (or# (maskw# x14# v2#) (maskw# x14# v3#))))

    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (int2Word# 0x4000#)
                             (or# (maskw# x15# v1#)
                                  (or# (maskw# x15# v2#) (maskw# x15# v3#))))

    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (int2Word# 0x8000#)
                             (or# (maskw# x16# v1#)
                                  (or# (maskw# x16# v2#) (maskw# x16# v3#))))

    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (int2Word# 0x10000#)
                             (or# (maskw# x17# v1#)
                                  (or# (maskw# x17# v2#) (maskw# x17# v3#))))

    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (int2Word# 0x20000#)
                             (or# (maskw# x18# v1#)
                                  (or# (maskw# x18# v2#) (maskw# x18# v3#))))

    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (int2Word# 0x40000#)
                             (or# (maskw# x19# v1#)
                                  (or# (maskw# x19# v2#) (maskw# x19# v3#))))

    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (int2Word# 0x80000#)
                             (or# (maskw# x20# v1#)
                                  (or# (maskw# x20# v2#) (maskw# x20# v3#))))

    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (int2Word# 0x100000#)
                             (or# (maskw# x21# v1#)
                                  (or# (maskw# x21# v2#) (maskw# x21# v3#))))

    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (int2Word# 0x200000#)
                             (or# (maskw# x22# v1#)
                                  (or# (maskw# x22# v2#) (maskw# x22# v3#))))

    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (int2Word# 0x400000#)
                             (or# (maskw# x23# v1#)
                                  (or# (maskw# x23# v2#) (maskw# x23# v3#))))

    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (int2Word# 0x800000#)
                             (or# (maskw# x24# v1#)
                                  (or# (maskw# x24# v2#) (maskw# x24# v3#))))

    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (int2Word# 0x1000000#)
                             (or# (maskw# x25# v1#)
                                  (or# (maskw# x25# v2#) (maskw# x25# v3#))))

    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (int2Word# 0x2000000#)
                             (or# (maskw# x26# v1#)
                                  (or# (maskw# x26# v2#) (maskw# x26# v3#))))

    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (int2Word# 0x4000000#)
                             (or# (maskw# x27# v1#)
                                  (or# (maskw# x27# v2#) (maskw# x27# v3#))))

    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (int2Word# 0x8000000#)
                             (or# (maskw# x28# v1#)
                                  (or# (maskw# x28# v2#) (maskw# x28# v3#))))

    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (int2Word# 0x10000000#)
                             (or# (maskw# x29# v1#)
                                  (or# (maskw# x29# v2#) (maskw# x29# v3#))))

    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (int2Word# 0x20000000#)
                             (or# (maskw# x30# v1#)
                                  (or# (maskw# x30# v2#) (maskw# x30# v3#))))

    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (int2Word# 0x40000000#)
                             (or# (maskw# x31# v1#)
                                  (or# (maskw# x31# v2#) (maskw# x31# v3#))))

    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (int2Word# 0x80000000#)
                             (or# (maskw# x32# v1#)
                                  (or# (maskw# x32# v2#) (maskw# x32# v3#))))

    return $! lineResult# p32# start


------------------------------------------------------------------------------
-- | Search through a mutable vector for a given int value. The number of
-- things to search for must be at most the number of things remaining in the
-- vector.
naiveSearch :: IntArray s       -- ^ vector to search
            -> Int              -- ^ start index
            -> Int              -- ^ number of things to search
            -> Elem             -- ^ value to search for
            -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch !vec !start !nThings !value = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value then return i else go (i+1)
{-# INLINE naiveSearch #-}


------------------------------------------------------------------------------
naiveSearch2 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Elem             -- ^ value to search for
             -> Elem             -- ^ value 2 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch2 !vec !start !nThings !value1 !value2 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 then return i else go (i+1)
{-# INLINE naiveSearch2 #-}


------------------------------------------------------------------------------
naiveSearch3 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Elem             -- ^ value to search for
             -> Elem             -- ^ value 2 to search for
             -> Elem             -- ^ value 3 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch3 !vec !start !nThings !value1 !value2 !value3 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 || x == value3
          then return i
          else go (i+1)
{-# INLINE naiveSearch3 #-}

-- end #if NO_C_SEARCH
#endif

------------------------------------------------------------------------------
-- | Search through a mutable vector for a given int value, cache-line aligned.
-- If the start index is cache-line aligned, and there is more than a
-- cache-line's room between the start index and the end of the vector, we will
-- search the cache-line all at once using an efficient branchless
-- bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch :: IntArray s        -- ^ vector to search
                -> Int               -- ^ start index
                -> Elem              -- ^ value to search for
                -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch !vec !start !value = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start


    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch vec start (min st1 st2) value
      else lineSearch vec start value
#else
    lineSearch vec start value
#endif
{-# INLINE cacheLineSearch #-}


------------------------------------------------------------------------------
-- | Search through a mutable vector for one of two given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch2 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Elem              -- ^ value to search for
                 -> Elem              -- ^ value 2 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch2 !vec !start !value !value2 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch2 vec start (min st1 st2) value value2
      else lineSearch2 vec start value value2
#else
    lineSearch2 vec start value value2
#endif
{-# INLINE cacheLineSearch2 #-}


------------------------------------------------------------------------------
-- | Search through a mutable vector for one of three given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch3 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Elem              -- ^ value to search for
                 -> Elem              -- ^ value 2 to search for
                 -> Elem              -- ^ value 3 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch3 !vec !start !value !value2 !value3 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch3 vec start (min st1 st2) value value2 value3
      else lineSearch3 vec start value value2 value3
#else
    lineSearch3 vec start value value2 value3
#endif
{-# INLINE cacheLineSearch3 #-}