src/Distributed/Failure/Phi.hs

{-# LANGUAGE FlexibleInstances #-}
module Distributed.Failure.Phi (
   Phi(..), phi
 ) where

import           Data.Foldable
import           Data.List.NonEmpty (NonEmpty)
import           Data.Sequence (Seq, (<|))
import qualified Data.Sequence as Seq
import           Data.Time
import           Distributed.Failure.Class
import           Statistics.Distribution
import           Statistics.Distribution.Normal

-- | State of a phi-accrual failure detector.
--
-- Every field is strict: forcing a 'Phi' to weak head normal form also forces
-- the pending log update, so repeated observations cannot pile up a chain of
-- suspended @take@/prepend thunks behind the record.
data Phi =
  Phi {
    _pThresh :: !Double
    -- ^ Suspicion threshold that the computed phi value is compared against.
  , _pWindow :: !Int
    -- ^ Number of samples retained in the log after each observation.
  , _pLog :: !(Seq Double)
    -- ^ Recorded samples; new observations are prepended at the front.
  }
  deriving (Show, Eq, Ord)

{- | Build a phi-accrual failure detector.

     The arguments are, in order: the threshold Φ, the window size, and a
     non-empty list of starting measurements. Each measurement is converted
     to a 'Double' with 'realToFrac' and stored in the order given. The
     starting measurements are not trimmed to the window size here.
 -}
phi :: Double -> Int -> NonEmpty DiffTime -> Phi
phi threshold window initial = Phi threshold window history
  where
    history = Seq.fromList (realToFrac <$> toList initial)

-- | Phi-accrual detection over a bounded log of 'Double' samples.
instance FailureDetector Phi where
  -- Prepend the new sample (converted with 'realToFrac') to the log, then
  -- keep only the first @window@ entries.
  observe (Phi thresh window history) sample =
      Phi thresh window trimmed
    where
      trimmed = Seq.take window (realToFrac sample <| history)

  -- Suspect once the negated base-10 logarithm of the probability of seeing
  -- a value above @elapsed@, under a normal distribution fitted to the log,
  -- reaches the threshold.
  suspected (Phi thresh _ history) elapsed =
      negate (logBase 10 tailProb) >= thresh
    where
      count = fromIntegral (length history)
      mean = sum history / count
      -- Sample standard deviation (divides by @count - 1@).
      squaredDiff x = (x - mean) ^ (2 :: Int)
      deviation = sqrt (sum (fmap squaredDiff history) / (count - 1))
      infinity = 1 / 0
      -- The effective standard deviation is the computed one whenever it is
      -- positive and finite. During startup, or when messages arrive more
      -- regularly than the clock can resolve, the computed value diverges.
      -- In that case we fall back to the mean, or to one when the mean is
      -- not usable either.
      -- In these edge cases the requested phi is not fully respected, but
      -- phi values still relate to each other in the same way. The failure
      -- detector requirements also still hold: a process that stops
      -- communicating is eventually suspected, and a correct process is
      -- eventually trusted again.
      spread
        | deviation > 0 && deviation < infinity = deviation
        | mean > 0 = mean
        | otherwise = 1
      model = normalDistr mean spread
      tailProb = complCumulative model (realToFrac elapsed)