{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module: Data.BloomFilter.Easy
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- An easy-to-use Bloom filter interface.

module Data.BloomFilter.Easy
    (
    -- * Easy creation and querying
      Bloom
    , easyList
    , B.elem
    , B.notElem
    , B.length

    -- ** Example: a spell checker
    -- $example

    -- * Useful defaults for creation
    , safeSuggestSizing
    , suggestSizing
    ) where

import Data.BloomFilter (Bloom)
import Data.BloomFilter.Hash (Hashable, cheapHashes)
import Data.BloomFilter.Util (nextPowerOfTwo)
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy as LB
import qualified Data.BloomFilter as B

-- | Create a Bloom filter with the given false positive rate and
-- members.  The hash functions used are computed by the @cheapHashes@
-- function from the 'Data.BloomFilter.Hash' module.
easyList :: (Hashable a)
         => Double              -- ^ desired false positive rate (0 < /e/ < 1)
         -> [a]                 -- ^ values to populate with
         -> Bloom a
{-# SPECIALIZE easyList :: Double -> [String] -> Bloom String #-}
{-# SPECIALIZE easyList :: Double -> [LB.ByteString] -> Bloom LB.ByteString #-}
{-# SPECIALIZE easyList :: Double -> [SB.ByteString] -> Bloom SB.ByteString #-}
easyList :: forall a. Hashable a => Double -> [a] -> Bloom a
easyList Double
errRate [a]
xs = forall a. (a -> [Hash]) -> Int -> [a] -> Bloom a
B.fromList (forall a. Hashable a => Int -> a -> [Hash]
cheapHashes Int
numHashes) Int
numBits [a]
xs
    where capacity :: Int
capacity = forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
xs
          (Int
numBits, Int
numHashes)
              | Int
capacity forall a. Ord a => a -> a -> Bool
> Int
0 = Int -> Double -> (Int, Int)
suggestSizing Int
capacity Double
errRate
              | Bool
otherwise    = (Int
1, Int
1)

-- | Suggest a good combination of filter size and number of hash
-- functions for a Bloom filter, based on its expected maximum
-- capacity and a desired false positive rate.
--
-- The false positive rate is the rate at which queries against the
-- filter should return @True@ when an element is not actually
-- present.  It should be a fraction between 0 and 1, so a 1% false
-- positive rate is represented by 0.01.
safeSuggestSizing
    :: Int              -- ^ expected maximum capacity
    -> Double           -- ^ desired false positive rate (0 < /e/ < 1)
    -> Either String (Int, Int)
safeSuggestSizing :: Int -> Double -> Either String (Int, Int)
safeSuggestSizing Int
capacity Double
errRate
    | Int
capacity forall a. Ord a => a -> a -> Bool
<= Int
0                = forall a b. a -> Either a b
Left String
"invalid capacity"
    | Double
errRate forall a. Ord a => a -> a -> Bool
<= Double
0 Bool -> Bool -> Bool
|| Double
errRate forall a. Ord a => a -> a -> Bool
>= Double
1 = forall a b. a -> Either a b
Left String
"invalid error rate"
    | Bool
otherwise =
    let cap :: Double
cap = forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
capacity
        (Double
bits :: Double, Double
hashes :: Double) =
            forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum [((-Double
k) forall a. Num a => a -> a -> a
* Double
cap forall a. Fractional a => a -> a -> a
/ forall a. Floating a => a -> a
log (Double
1 forall a. Num a => a -> a -> a
- (Double
errRate forall a. Floating a => a -> a -> a
** (Double
1 forall a. Fractional a => a -> a -> a
/ Double
k))), Double
k)
                     | Double
k <- [Double
1..Double
100]]
        roundedBits :: Int
roundedBits = Int -> Int
nextPowerOfTwo (forall a b. (RealFrac a, Integral b) => a -> b
ceiling Double
bits)
    in if Int
roundedBits forall a. Ord a => a -> a -> Bool
<= Int
0 Bool -> Bool -> Bool
|| Int -> Bool
maxbitstoolarge Int
roundedBits
       then forall a b. a -> Either a b
Left  String
"capacity too large to represent"
       else forall a b. b -> Either a b
Right (Int
roundedBits, forall a b. (RealFrac a, Integral b) => a -> b
truncate Double
hashes)
  where
    maxbits :: Int
maxbits = Int
0xffffffff
    -- On 32 bit, maxbits is larger than maxBound :: Int, so wraps around
    -- to a negative number; avoid using it in that case.
    maxbitstoolarge :: Int -> Bool
maxbitstoolarge Int
n = if Int
maxbits forall a. Ord a => a -> a -> Bool
> Int
0 then Int
n forall a. Ord a => a -> a -> Bool
> Int
maxbits else Bool
True

-- | Behaves as 'safeSuggestSizing', but calls 'error' if given
-- invalid or out-of-range inputs.
suggestSizing :: Int            -- ^ expected maximum capacity
              -> Double         -- ^ desired false positive rate (0 < /e/ < 1)
              -> (Int, Int)
suggestSizing :: Int -> Double -> (Int, Int)
suggestSizing Int
cap Double
errs = forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either forall {c}. String -> c
fatal forall a. a -> a
id (Int -> Double -> Either String (Int, Int)
safeSuggestSizing Int
cap Double
errs)
  where fatal :: String -> c
fatal = forall a. HasCallStack => String -> a
error forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String
"Data.BloomFilter.Util.suggestSizing: " forall a. [a] -> [a] -> [a]
++)

-- $example
--
-- This example reads a dictionary file containing one word per line,
-- constructs a Bloom filter with a 1% false positive rate, and
-- spellchecks its standard input.  Like the Unix @spell@ command, it
-- prints each word that it does not recognize.
--
-- @
--import Data.BloomFilter.Easy (easyList, elemB)
--
--main = do
--  filt <- ('easyList' 0.01 . words) \`fmap\` readFile \"/usr/share/dict/words\"
--  let check word | 'elemB' word filt = \"\"
--                 | otherwise         = word ++ \"\\n\"
--  interact (concat . map check . lines)
-- @