{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving#-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE MonoLocalBinds #-}
{-# OPTIONS_GHC -fno-cse #-}
{-# OPTIONS_GHC -fno-full-laziness #-}
{-# OPTIONS_GHC -fno-float-in #-}
{-# OPTIONS_GHC -fno-warn-unused-binds #-}
#if __GLASGOW_HASKELL__ >= 706
{-# LANGUAGE PolyKinds #-}
#endif
#if __GLASGOW_HASKELL__ >= 707
{-# LANGUAGE RoleAnnotations #-}
#endif
module Data.HyperLogLog.Type
(
HyperLogLog(..)
, HasHyperLogLog(..)
, size
, insert
, insertHash
, intersectionSize
, cast
#if __GLASGOW_HASKELL__ >= 708
, coerceConfig
#endif
) where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative
#endif
import Control.DeepSeq (NFData (..))
import Control.Lens
import Control.Monad
import Crypto.MAC.SipHash
import Data.Approximate.Type
import Data.Bits.Extras
import qualified Data.Binary as Binary
import Data.Binary
import Data.Bytes.Put (runPutS)
import Data.Bytes.Serial
import Data.HyperLogLog.Config
import Data.Proxy
import Data.Reflection
import Data.Semigroup
import Data.Serialize as Serialize
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector.Unboxed.Mutable as MV
#if __GLASGOW_HASKELL__ < 710
import Data.Word
#endif
import GHC.Generics hiding (D, to)
import GHC.Int
#if __GLASGOW_HASKELL__ >= 708
import Data.Type.Coercion (Coercion(..))
#endif
newtype HyperLogLog p = HyperLogLog { runHyperLogLog :: V.Vector Rank }
deriving (Eq, Show, Generic, NFData)
#if __GLASGOW_HASKELL__ >= 708
coerceConfig :: forall p q . (Reifies p Integer, Reifies q Integer) => Maybe (Coercion (HyperLogLog p) (HyperLogLog q))
coerceConfig | reflect (Proxy :: Proxy p) == reflect (Proxy :: Proxy q) = Just Coercion
| otherwise = Nothing
#endif
#if __GLASGOW_HASKELL__ >= 707
type role HyperLogLog nominal
#endif
instance Serialize (HyperLogLog p)
instance Serial (HyperLogLog p) where
serialize (HyperLogLog v) = serialize (V.toList v)
deserialize = liftM (HyperLogLog . V.fromList) deserialize
instance Binary (HyperLogLog p) where
put (HyperLogLog v) = Binary.put (V.toList v)
get = fmap (HyperLogLog . V.fromList) Binary.get
class HasHyperLogLog a p | a -> p where
hyperLogLog :: Lens' a (HyperLogLog p)
instance HasHyperLogLog (HyperLogLog p) p where
hyperLogLog = id
_HyperLogLog :: Iso' (HyperLogLog p) (V.Vector Rank)
_HyperLogLog = iso runHyperLogLog HyperLogLog
{-# INLINE _HyperLogLog #-}
instance Semigroup (HyperLogLog p) where
HyperLogLog a <> HyperLogLog b = HyperLogLog (V.zipWith max a b)
{-# INLINE (<>) #-}
instance Reifies p Integer => Monoid (HyperLogLog p) where
mempty = HyperLogLog $ V.replicate (numBuckets (reflect (Proxy :: Proxy p))) 0
{-# INLINE mempty #-}
mappend = (<>)
{-# INLINE mappend #-}
sipKey :: SipKey
sipKey = SipKey 4 7
siphash :: Serial a => a -> Word64
siphash a = h
where !bs = runPutS (serialize a)
(SipHash !h) = hash sipKey bs
{-# INLINE siphash #-}
insert :: (Reifies s Integer, Serial a) => a -> HyperLogLog s -> HyperLogLog s
insert = insertHash . w32 . siphash
{-# INLINE insert #-}
insertHash :: Reifies s Integer => Word32 -> HyperLogLog s -> HyperLogLog s
insertHash h m@(HyperLogLog v) = HyperLogLog $ V.modify (\x -> do
old <- MV.read x bk
when (rnk > old) $ MV.write x bk rnk
) v where
!n = reflect m
!bk = calcBucket n h
!rnk = calcRank n h
{-# INLINE insertHash #-}
size :: Reifies p Integer => HyperLogLog p -> Approximate Int64
size m@(HyperLogLog bs) = Approximate 0.9972 l expected h where
n = reflect m
m' = fromIntegral (numBuckets n)
numZeros = fromIntegral . V.length . V.filter (== 0) $ bs
res = case raw < smallRange n of
True | numZeros > 0 -> m' * log (m' / numZeros)
| otherwise -> raw
False | raw <= interRange -> raw
| otherwise -> raw + (raw / lim32) * raw
raw = rawFact n * (1 / sm)
sm = V.sum $ V.map (\x -> 1 / (2 ^^ x)) bs
expected = round res
sd = 1.04 / sqrt m'
l = floor $ max (res*(1-3*sd)) 0
h = ceiling $ res*(1+3*sd)
{-# INLINE size #-}
#ifdef HERBIE
{-# ANN size "NoHerbie" #-}
#endif
intersectionSize :: Reifies p Integer => [HyperLogLog p] -> Approximate Int64
intersectionSize [] = 0
intersectionSize (x:xs) = withMin 0 $ size x + intersectionSize xs - intersectionSize (mappend x <$> xs)
{-# INLINE intersectionSize #-}
cast :: forall p q. (Reifies p Integer, Reifies q Integer) => HyperLogLog p -> Maybe (HyperLogLog q)
cast old
| newBuckets <= oldBuckets = Just $ over _HyperLogLog ?? mempty $ V.modify $ \m ->
V.forM_ (V.indexed $ old^._HyperLogLog) $ \ (i,o) -> do
let j = mod i newBuckets
a <- MV.read m j
MV.write m j (max o a)
| otherwise = Nothing
where
newBuckets = numBuckets (reflect (Proxy :: Proxy q))
oldBuckets = numBuckets (reflect old)
{-# INLINE cast #-}