#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 706
#endif
#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 707
#endif
module Data.HyperLogLog.Type
(
HyperLogLog(..)
, HasHyperLogLog(..)
, size
, insert
, insertHash
, intersectionSize
, cast
) where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative
#endif
import Control.Lens
import Control.Monad
import Crypto.MAC.SipHash
import Data.Approximate.Type
import Data.Bits
import Data.Bits.Extras
import qualified Data.Binary as Binary
import Data.Binary
import Data.Bytes.Put (runPutS)
import Data.Bytes.Serial
import Data.HyperLogLog.Config
import Data.Proxy
import Data.Semigroup
import Data.Serialize as Serialize
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector.Unboxed.Mutable as MV
#if __GLASGOW_HASKELL__ < 710
import Data.Word
#endif
import Generics.Deriving hiding (D, to)
import GHC.Int
newtype HyperLogLog p = HyperLogLog { runHyperLogLog :: V.Vector Rank }
deriving (Eq, Show, Generic)
#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 707
type role HyperLogLog nominal
#endif
instance Serialize (HyperLogLog p)
instance Serial (HyperLogLog p) where
serialize (HyperLogLog v) = serialize (V.toList v)
deserialize = liftM (HyperLogLog . V.fromList) deserialize
instance Binary (HyperLogLog p) where
put (HyperLogLog v) = Binary.put (V.toList v)
get = fmap (HyperLogLog . V.fromList) Binary.get
class HasHyperLogLog a p | a -> p where
hyperLogLog :: Lens' a (HyperLogLog p)
instance HasHyperLogLog (HyperLogLog p) p where
hyperLogLog = id
_HyperLogLog :: Iso' (HyperLogLog p) (V.Vector Rank)
_HyperLogLog = iso runHyperLogLog HyperLogLog
instance ReifiesConfig p => HasConfig (HyperLogLog p) where
config = to reflectConfig
instance Semigroup (HyperLogLog p) where
HyperLogLog a <> HyperLogLog b = HyperLogLog (V.zipWith max a b)
instance ReifiesConfig p => Monoid (HyperLogLog p) where
mempty = HyperLogLog $ V.replicate (reflectConfig (Proxy :: Proxy p) ^. numBuckets) 0
mappend = (<>)
sipKey :: SipKey
sipKey = SipKey 4 7
siphash :: Serial a => a -> Word64
siphash a = h
where !bs = runPutS (serialize a)
(SipHash !h) = hash sipKey bs
insert :: (ReifiesConfig s, Serial a) => a -> HyperLogLog s -> HyperLogLog s
insert = insertHash . w32 . siphash
insertHash :: ReifiesConfig s => Word32 -> HyperLogLog s -> HyperLogLog s
insertHash h m@(HyperLogLog v) = HyperLogLog $ V.modify (\x -> do
old <- MV.read x bk
when (rnk > old) $ MV.write x bk rnk
) v where
!bk = calcBucket m h
!rnk = calcRank m h
size :: ReifiesConfig p => HyperLogLog p -> Approximate Int64
size m@(HyperLogLog bs) = Approximate 0.9972 l expected h where
m' = fromIntegral (m^.numBuckets)
numZeros = fromIntegral . V.length . V.filter (== 0) $ bs
res = case raw < m^.smallRange of
True | numZeros > 0 -> m' * log (m' / numZeros)
| otherwise -> raw
False | raw <= m^.interRange -> raw
| otherwise -> 1 * lim32 * log (1 raw / lim32)
raw = m^.rawFact * (1 / sm)
sm = V.sum $ V.map (\x -> 1 / (2 ^^ x)) bs
expected = round res
sd = err (m^.numBits)
err n = 1.04 / sqrt (fromInteger (bit n))
l = floor $ max (res*(13*sd)) 0
h = ceiling $ res*(1+3*sd)
intersectionSize :: ReifiesConfig p => [HyperLogLog p] -> Approximate Int64
intersectionSize [] = 0
intersectionSize (x:xs) = withMin 0 $ size x + intersectionSize xs intersectionSize (mappend x <$> xs)
cast :: forall p q. (ReifiesConfig p, ReifiesConfig q) => HyperLogLog p -> Maybe (HyperLogLog q)
cast old
| newBuckets <= oldBuckets = Just $ over _HyperLogLog ?? mempty $ V.modify $ \m ->
V.forM_ (V.indexed $ old^._HyperLogLog) $ \ (i,o) -> do
let j = mod i newBuckets
a <- MV.read m j
MV.write m j (max o a)
| otherwise = Nothing
where
newConfig = reflectConfig (Proxy :: Proxy q)
newBuckets = newConfig^.numBuckets
oldBuckets = old^.numBuckets