{-# LANGUAGE BangPatterns, Rank2Types, ScopedTypeVariables, TypeOperators #-}

-- |
-- Module: Data.BloomFilter
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--
-- This module provides low-level control.  For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.

module Data.BloomFilter
    (
    -- * Overview
    -- $overview

    -- ** Ease of use
    -- $ease

    -- ** Performance
    -- $performance

    -- * Types
      Hash
    , Bloom
    , MBloom

    -- * Immutable Bloom filters

    -- ** Conversion
    , freeze
    , thaw
    , unsafeFreeze

    -- ** Creation
    , unfold

    , fromList
    , empty
    , singleton

    -- ** Accessors
    , length
    , elem
    , notElem

    -- ** Modification
    , insert
    , insertList

    -- * The underlying representation
    -- | If you serialize the raw bit arrays below to disk, do not
    -- expect them to be portable to systems with different
    -- conventions for endianness or word size.

    -- | The raw bit array used by the immutable 'Bloom' type.
    , bitArray
    ) where

import Control.Monad (liftM, forM_)
import Control.Monad.ST (ST, runST)
import Control.DeepSeq (NFData(..))
import Data.Array.Base (unsafeAt)
import qualified Data.Array.Base as ST
import Data.Array.Unboxed (UArray)
import Data.Bits ((.&.), unsafeShiftL, unsafeShiftR)
import Data.BloomFilter.Util ((:*)(..))
import qualified Data.BloomFilter.Mutable as MB
import qualified Data.BloomFilter.Mutable.Internal as MB
import Data.BloomFilter.Mutable.Internal (Hash, MBloom)
import Data.Word (Word32)

import Prelude hiding (elem, length, notElem,
                       (/), (*), div, divMod, mod, rem)


-- | An immutable Bloom filter, suitable for querying from pure code.
data Bloom a = B {
      forall a. Bloom a -> a -> [Hash]
hashes :: !(a -> [Hash])
    , forall a. Bloom a -> Int
shift :: {-# UNPACK #-} !Int
    , forall a. Bloom a -> Int
mask :: {-# UNPACK #-} !Int
    , forall a. Bloom a -> UArray Int Hash
bitArray :: {-# UNPACK #-} !(UArray Int Hash)
    }

instance Show (Bloom a) where
    show :: Bloom a -> String
show Bloom a
ub = String
"Bloom { " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show ((Int
1::Int) forall a. Bits a => a -> Int -> a
`unsafeShiftL` forall a. Bloom a -> Int
shift Bloom a
ub) forall a. [a] -> [a] -> [a]
++ String
" bits } "

instance NFData (Bloom a) where
    rnf :: Bloom a -> ()
rnf !Bloom a
_ = ()

logBitsInHash :: Int
logBitsInHash :: Int
logBitsInHash = Int
5 -- Data.BloomFilter.Mutable.logPower2 bitsInHash

-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- @
--import "Data.BloomFilter.Hash" (cheapHashes)
--
--filter = create (cheapHashes 3) 1024 $ \mf -> do
--           insertMB mf \"foo\"
--           insertMB mf \"bar\"
-- @
--
-- Note that the result of the setup function is not used.
create :: (a -> [Hash])        -- ^ family of hash functions to use
        -> Int                  -- ^ number of bits in filter
        -> (forall s. (MBloom s a -> ST s ()))  -- ^ setup function
        -> Bloom a
{-# INLINE create #-}
create :: forall a.
(a -> [Hash])
-> Int -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create a -> [Hash]
hash Int
numBits forall s. MBloom s a -> ST s ()
body = forall a. (forall s. ST s a) -> a
runST forall a b. (a -> b) -> a -> b
$ do
  MBloom s a
mb <- forall a s. (a -> [Hash]) -> Int -> ST s (MBloom s a)
MB.new a -> [Hash]
hash Int
numBits
  forall s. MBloom s a -> ST s ()
body MBloom s a
mb
  forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb

-- | Create an immutable Bloom filter from a mutable one.  The mutable
-- filter may be modified afterwards.
freeze :: MBloom s a -> ST s (Bloom a)
freeze :: forall s a. MBloom s a -> ST s (Bloom a)
freeze MBloom s a
mb = forall a. (a -> [Hash]) -> Int -> Int -> UArray Int Hash -> Bloom a
B (forall s a. MBloom s a -> a -> [Hash]
MB.hashes MBloom s a
mb) (forall s a. MBloom s a -> Int
MB.shift MBloom s a
mb) (forall s a. MBloom s a -> Int
MB.mask MBloom s a
mb) forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM`
            forall i (a :: * -> * -> *) e (m :: * -> *) (b :: * -> * -> *).
(Ix i, MArray a e m, IArray b e) =>
a i e -> m (b i e)
ST.freeze (forall s a. MBloom s a -> STUArray s Int Hash
MB.bitArray MBloom s a
mb)

-- | Create an immutable Bloom filter from a mutable one.  The mutable
-- filter /must not/ be modified afterwards, or a runtime crash may
-- occur.  For a safer creation interface, use 'freeze' or 'create'.
unsafeFreeze :: MBloom s a -> ST s (Bloom a)
unsafeFreeze :: forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb = forall a. (a -> [Hash]) -> Int -> Int -> UArray Int Hash -> Bloom a
B (forall s a. MBloom s a -> a -> [Hash]
MB.hashes MBloom s a
mb) (forall s a. MBloom s a -> Int
MB.shift MBloom s a
mb) (forall s a. MBloom s a -> Int
MB.mask MBloom s a
mb) forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM`
                    forall i (a :: * -> * -> *) e (m :: * -> *) (b :: * -> * -> *).
(Ix i, MArray a e m, IArray b e) =>
a i e -> m (b i e)
ST.unsafeFreeze (forall s a. MBloom s a -> STUArray s Int Hash
MB.bitArray MBloom s a
mb)

-- | Copy an immutable Bloom filter to create a mutable one.  There is
-- no non-copying equivalent.
thaw :: Bloom a -> ST s (MBloom s a)
thaw :: forall a s. Bloom a -> ST s (MBloom s a)
thaw Bloom a
ub = forall s a.
(a -> [Hash]) -> Int -> Int -> STUArray s Int Hash -> MBloom s a
MB.MB (forall a. Bloom a -> a -> [Hash]
hashes Bloom a
ub) (forall a. Bloom a -> Int
shift Bloom a
ub) (forall a. Bloom a -> Int
mask Bloom a
ub) forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM` forall i (a :: * -> * -> *) e (b :: * -> * -> *) (m :: * -> *).
(Ix i, IArray a e, MArray b e m) =>
a i e -> m (b i e)
ST.thaw (forall a. Bloom a -> UArray Int Hash
bitArray Bloom a
ub)

-- | Create an empty Bloom filter.
--
-- This function is subject to fusion with 'insert'
-- and 'insertList'.
empty :: (a -> [Hash])         -- ^ family of hash functions to use
       -> Int                   -- ^ number of bits in filter
       -> Bloom a
{-# INLINE [1] empty #-}
empty :: forall a. (a -> [Hash]) -> Int -> Bloom a
empty a -> [Hash]
hash Int
numBits = forall a.
(a -> [Hash])
-> Int -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create a -> [Hash]
hash Int
numBits (\MBloom s a
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return ())

-- | Create a Bloom filter with a single element.
--
-- This function is subject to fusion with 'insert'
-- and 'insertList'.
singleton :: (a -> [Hash])     -- ^ family of hash functions to use
           -> Int               -- ^ number of bits in filter
           -> a                 -- ^ element to insert
           -> Bloom a
{-# INLINE [1] singleton #-}
singleton :: forall a. (a -> [Hash]) -> Int -> a -> Bloom a
singleton a -> [Hash]
hash Int
numBits a
elt = forall a.
(a -> [Hash])
-> Int -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create a -> [Hash]
hash Int
numBits (\MBloom s a
mb -> forall s a. MBloom s a -> a -> ST s ()
MB.insert MBloom s a
mb a
elt)

-- | Given a filter's mask and a hash value, compute an offset into
-- a word array and a bit offset within that word.
hashIdx :: Int -> Word32 -> (Int :* Int)
hashIdx :: Int -> Hash -> Int :* Int
hashIdx Int
msk Hash
x = (Int
y forall a. Bits a => a -> Int -> a
`unsafeShiftR` Int
logBitsInHash) forall a b. a -> b -> a :* b
:* (Int
y forall a. Bits a => a -> a -> a
.&. Int
hashMask)
  where hashMask :: Int
hashMask = Int
31 -- bitsInHash - 1
        y :: Int
y = forall a b. (Integral a, Num b) => a -> b
fromIntegral Hash
x forall a. Bits a => a -> a -> a
.&. Int
msk

-- | Hash the given value, returning a list of (word offset, bit
-- offset) pairs, one per hash value.
hashesU :: Bloom a -> a -> [Int :* Int]
hashesU :: forall a. Bloom a -> a -> [Int :* Int]
hashesU Bloom a
ub a
elt = Int -> Hash -> Int :* Int
hashIdx (forall a. Bloom a -> Int
mask Bloom a
ub) forall a b. (a -> b) -> [a] -> [b]
`map` forall a. Bloom a -> a -> [Hash]
hashes Bloom a
ub a
elt

-- | Query an immutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: a -> Bloom a -> Bool
elem :: forall a. a -> Bloom a -> Bool
elem a
elt Bloom a
ub = forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Int :* Int) -> Bool
test (forall a. Bloom a -> a -> [Int :* Int]
hashesU Bloom a
ub a
elt)
  where test :: (Int :* Int) -> Bool
test (Int
off :* Int
bit) = (forall a. Bloom a -> UArray Int Hash
bitArray Bloom a
ub forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> Int -> e
`unsafeAt` Int
off) forall a. Bits a => a -> a -> a
.&. (Hash
1 forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
bit) forall a. Eq a => a -> a -> Bool
/= Hash
0
          
modify :: (forall s. (MBloom s a -> ST s z))  -- ^ mutation function (result is discarded)
        -> Bloom a
        -> Bloom a
{-# INLINE modify #-}
modify :: forall a z. (forall s. MBloom s a -> ST s z) -> Bloom a -> Bloom a
modify forall s. MBloom s a -> ST s z
body Bloom a
ub = forall a. (forall s. ST s a) -> a
runST forall a b. (a -> b) -> a -> b
$ do
  MBloom s a
mb <- forall a s. Bloom a -> ST s (MBloom s a)
thaw Bloom a
ub
  z
_ <- forall s. MBloom s a -> ST s z
body MBloom s a
mb
  forall s a. MBloom s a -> ST s (Bloom a)
unsafeFreeze MBloom s a
mb

-- | Create a new Bloom filter from an existing one, with the given
-- member added.
--
-- This function may be expensive, as it is likely to cause the
-- underlying bit array to be copied.
--
-- Repeated applications of this function with itself are subject to
-- fusion.
insert :: a -> Bloom a -> Bloom a
{-# NOINLINE insert #-}
insert :: forall a. a -> Bloom a -> Bloom a
insert a
elt = forall a z. (forall s. MBloom s a -> ST s z) -> Bloom a -> Bloom a
modify (forall a b c. (a -> b -> c) -> b -> a -> c
flip forall s a. MBloom s a -> a -> ST s ()
MB.insert a
elt)

-- | Create a new Bloom filter from an existing one, with the given
-- members added.
--
-- This function may be expensive, as it is likely to cause the
-- underlying bit array to be copied.
--
-- Repeated applications of this function with itself are subject to
-- fusion.
insertList :: [a] -> Bloom a -> Bloom a
{-# NOINLINE insertList #-}
insertList :: forall a. [a] -> Bloom a -> Bloom a
insertList [a]
elts = forall a z. (forall s. MBloom s a -> ST s z) -> Bloom a -> Bloom a
modify forall a b. (a -> b) -> a -> b
$ \MBloom s a
mb -> forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (forall s a. MBloom s a -> a -> ST s ()
MB.insert MBloom s a
mb) [a]
elts

{-# RULES "Bloom insert . insert" forall a b u.
    insert b (insert a u) = insertList [a,b] u
  #-}

{-# RULES "Bloom insertList . insert" forall x xs u.
    insertList xs (insert x u) = insertList (x:xs) u
  #-}

{-# RULES "Bloom insert . insertList" forall x xs u.
    insert x (insertList xs u) = insertList (x:xs) u
  #-}

{-# RULES "Bloom insertList . insertList" forall xs ys u.
    insertList xs (insertList ys u) = insertList (xs++ys) u
  #-}

{-# RULES "Bloom insertList . empty" forall h n xs.
    insertList xs (empty h n) = fromList h n xs
  #-}

{-# RULES "Bloom insertList . singleton" forall h n x xs.
    insertList xs (singleton h n x) = fromList h n (x:xs)
  #-}

-- | Query an immutable Bloom filter for non-membership.  If the value
-- /is/ present, return @False@.  If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: a -> Bloom a -> Bool
notElem :: forall a. a -> Bloom a -> Bool
notElem a
elt Bloom a
ub = forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Int :* Int) -> Bool
test (forall a. Bloom a -> a -> [Int :* Int]
hashesU Bloom a
ub a
elt)
  where test :: (Int :* Int) -> Bool
test (Int
off :* Int
bit) = (forall a. Bloom a -> UArray Int Hash
bitArray Bloom a
ub forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> Int -> e
`unsafeAt` Int
off) forall a. Bits a => a -> a -> a
.&. (Hash
1 forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
bit) forall a. Eq a => a -> a -> Bool
== Hash
0

-- | Return the size of an immutable Bloom filter, in bits.
length :: Bloom a -> Int
length :: forall a. Bloom a -> Int
length = forall a. Bits a => a -> Int -> a
unsafeShiftL Int
1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Bloom a -> Int
shift

-- | Build an immutable Bloom filter from a seed value.  The seeding
-- function populates the filter as follows.
--
--   * If it returns 'Nothing', it is finished producing values to
--     insert into the filter.
--
--   * If it returns @'Just' (a,b)@, @a@ is added to the filter and
--     @b@ is used as a new seed.
unfold :: forall a b. (a -> [Hash]) -- ^ family of hash functions to use
        -> Int                       -- ^ number of bits in filter
        -> (b -> Maybe (a, b))       -- ^ seeding function
        -> b                         -- ^ initial seed
        -> Bloom a
{-# INLINE unfold #-}
unfold :: forall a b.
(a -> [Hash]) -> Int -> (b -> Maybe (a, b)) -> b -> Bloom a
unfold a -> [Hash]
hs Int
numBits b -> Maybe (a, b)
f b
k = forall a.
(a -> [Hash])
-> Int -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create a -> [Hash]
hs Int
numBits (forall s. b -> MBloom s a -> ST s ()
loop b
k)
  where loop :: forall s. b -> MBloom s a -> ST s ()
        loop :: forall s. b -> MBloom s a -> ST s ()
loop b
j MBloom s a
mb = case b -> Maybe (a, b)
f b
j of
                      Just (a
a, b
j') -> forall s a. MBloom s a -> a -> ST s ()
MB.insert MBloom s a
mb a
a forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> forall s. b -> MBloom s a -> ST s ()
loop b
j' MBloom s a
mb
                      Maybe (a, b)
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return ()

-- | Create an immutable Bloom filter, populating it from a list of
-- values.
--
-- Here is an example that uses the @cheapHashes@ function from the
-- "Data.BloomFilter.Hash" module to create a hash function that
-- returns three hashes.
--
-- @
--import "Data.BloomFilter.Hash" (cheapHashes)
--
--filt = fromList (cheapHashes 3) 1024 [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (a -> [Hash])      -- ^ family of hash functions to use
          -> Int                -- ^ number of bits in filter
          -> [a]                -- ^ values to populate with
          -> Bloom a
{-# INLINE [1] fromList #-}
fromList :: forall a. (a -> [Hash]) -> Int -> [a] -> Bloom a
fromList a -> [Hash]
hs Int
numBits [a]
list = forall a.
(a -> [Hash])
-> Int -> (forall s. MBloom s a -> ST s ()) -> Bloom a
create a -> [Hash]
hs Int
numBits forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [a]
list forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall s a. MBloom s a -> a -> ST s ()
MB.insert

{-# RULES "Bloom insertList . fromList" forall h n xs ys.
    insertList xs (fromList h n ys) = fromList h n (xs ++ ys)
  #-}

{-
-- This is a simpler definition, but GHC doesn't inline the unfold
-- sensibly.

fromList hashes numBits = unfold hashes numBits convert
  where convert (x:xs) = Just (x, xs)
        convert _      = Nothing
-}

-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter.  Note that
--   a filter is fixed in size; it cannot be resized after creation.
--
-- * A function that accepts a value, and should return a fixed-size
--   list of hashes of that value.  To keep the false positive rate
--   low, the hashes computes should, as far as possible, be
--   independent.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate.  The @suggestSizing@ function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.

-- $ease
--
-- This module provides immutable interfaces for working with a
-- query-only Bloom filter, and for converting to and from mutable
-- Bloom filters.
--
-- For a higher-level interface that is easy to use, see the
-- 'Data.BloomFilter.Easy' module.

-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.
--
-- For efficiency, the number of bits requested when creating a Bloom
-- filter is rounded up to the nearest power of two.  This lets the
-- implementation use bitwise operations internally, instead of much
-- more expensive multiplication, division, and modulus operations.