{-# LANGUAGE DataKinds #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# OPTIONS_GHC -Wall #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- | 'tick' uses the rdtsc chipset to measure time performance of a computation.
--
-- The measurement unit - a 'Cycle' - is one oscillation of the chip crystal as measured by the <https://en.wikipedia.org/wiki/Time_Stamp_Counter rdtsc> instruction which inspects the TSC register.
--
-- For reference, a computer with a frequency of 2 GHz means that one cycle is equivalent to 0.5 nanoseconds.
--
module Perf.Cycle
  ( -- $setup
    Cycle
  , tick_
  , warmup
  , tick
  , app
  , tickIO
  , ticks
  , qtick
  , ticksIO
  , tickns
  , force
  , replicateM'
  , average
  , deciles
  , percentile
  ) where

import qualified Control.Foldl as L
import Data.List
import Data.TDigest
import NumHask.Prelude hiding (force)
import System.CPUTime.Rdtsc
import qualified Protolude

-- $setup
-- >>> :set -XNoImplicitPrelude
-- >>> import Perf.Cycle
-- >>> let n = 1000
-- >>> let a = 1000
-- >>> let f x = foldl' (+) 0 [1 .. x]
--


-- | an unwrapped Word64
type Cycle = Word64

instance AdditiveMagma Cycle where
  plus = (Protolude.+)

instance AdditiveUnital Cycle where
  zero = 0

instance AdditiveAssociative Cycle

instance AdditiveCommutative Cycle

instance Additive Cycle

instance AdditiveInvertible Cycle where
  negate = Protolude.negate

instance AdditiveGroup Cycle

instance ToInteger Cycle where
    toInteger = Protolude.toInteger

-- | tick_ measures the number of cycles it takes to read the rdtsc chip twice: the difference is then how long it took to read the clock the second time.
--
-- Below are indicative measurements using tick_:
--
-- >>> onetick <- tick_
-- >>> ticks' <- replicateM 10 tick_
-- >>> manyticks <- replicateM 1000000 tick_
-- >>> let average = L.fold ((/) <$> L.sum <*> L.genericLength)
-- >>> let avticks = average (fromIntegral <$> manyticks)
-- >>> let qticks = deciles 10 manyticks
-- >>> let tick999 = percentile 0.999 manyticks
--
-- > one tick_: 78 cycles
-- > next 10: [20,18,20,20,20,20,18,16,20,20]
-- > average over 1m: 20.08 cycles
-- > 99.999% perc: 7,986
-- > 99.9% perc: 50.97
-- > 99th perc:  24.99
-- > 40th perc:  18.37
-- > [min, 10th, 20th, .. 90th, max]:
-- > 12.00 16.60 17.39 17.88 18.37 18.86 19.46 20.11 20.75 23.04 5.447e5
--
-- The distribution of tick_ measurements is highly skewed, with the maximum being around 50k cycles, which is of the order of a GC. The important point on the distribution is around the 30th to 50th percentile, where you get a clean measure, usually free of GC activity and cache miss-fires
tick_ :: IO Cycle
tick_ = do
  t <- rdtsc
  t' <- rdtsc
  pure (t' - t)

-- | Warm up the register, to avoid a high first measurement. Without a warmup, one or more larger values can occur at the start of a measurement spree, and often are in the zone of an L2 miss.
--
-- >>> t <- tick_ -- first measure can be very high
-- >>> _ <- warmup 100
-- >>> t <- tick_ -- should be around 20 (3k for ghci)
--
warmup :: Int -> IO Double
warmup n = do
  ts <- replicateM n tick_
  pure $ average ts

-- | `tick f a` strictly applies a to f, and returns a (Cycle, f a)
--
-- >>> _ <- warmup 100
-- >>> (cs, _) <- tick f a
--
-- > one tick: 197012 cycles
-- > average over 1000: 10222.79 cycles -- 10 cycles per operation
-- > [min, 30th, median, 90th, 99th, max]:
-- > 1.002e4 1.011e4 1.013e4 1.044e4 1.051e4 2.623e4
tick :: (a -> b) -> a -> IO (Cycle, b)
tick f a = do
  !t <- rdtsc
  !a' <- pure (f a)
  !t' <- rdtsc
  pure (t' - t, a')

-- | evaluates and measures an `IO a`
--
-- >>> (cs, _) <- tickIO (pure (f a))
--
tickIO :: IO a -> IO (Cycle, a)
tickIO a = do
  t <- rdtsc
  !a' <- a
  t' <- rdtsc
  pure (t' - t, a')

-- | needs more testing
app :: t -> () -> t
app e () = e
{-# NOINLINE app #-}

-- | n measurements of a tick
--
-- returns a list of Cycles and the last evaluated f a
--
-- GHC is very good as memoization, and any of the functions that measuring a computation multiple times are fraught.  When a computation actually gets memoized is an inexact science.  Current readings are:
--
-- > sum to 1000.0
-- > Perf.ticks n f a                        8.37e3 cycles
-- > Main.ticks n f a                        8.38e3 cycles
-- > Perf.ticksIO n (pure $ f a)             8.38e3 cycles
-- > Perf.qtick n f a                        8.38e3 cycles
-- > Main.qtick n f a                        8.38e3 cycles
-- > replicateM n (tick f a)                 8.37e3 cycles
-- > replicateM' n (tick f a)                9.74e3 cycles
-- > replicateM n (tickIO (pure (f a)))      1.21e4 cycles
-- > replicateM n (tick (app (f a)) ())      9.72e3 cycles
-- > replicateM n (tick identity (f n))      18.2 cycles
-- > replicateM n (tick (const (f a)) ())    9.71e3 cycles
-- > (replicateM n . tick f) <$> [1,10,100,1000,10000]:  16.3 16.2 16.3 16.2 16.2
-- > Perf.tickns n f [1,10,100,1000,10000]:  16.2 16.2 16.2 16.2 16.2
--
-- >>> let n = 1000
-- >>> (cs, fa) <- ticks n f a
--
ticks :: Int -> (a -> b) -> a -> IO ([Cycle], b)
ticks n f a = do
  ts <- replicateM' n (tick f a)
  pure (fst <$> ts, snd $ last ts)
{-# INLINE ticks #-}

-- | returns the 40th percentile measurement and the last evaluated f a
--
-- >>> (c, fa) <- qtick n f a
--
qtick :: Int -> (a -> b) -> a -> IO (Double, b)
qtick n f a = do
  ts <- replicateM' n (tick f a)
  pure (percentile 0.4 $ fst <$> ts, snd $ last ts)
{-# INLINE qtick #-}

-- | n measuremenst of a tickIO
--
-- returns an IO tuple; list of Cycles and the last evaluated f a
--
-- >>> (cs, fa) <- ticksIO n (pure $ f a)
--
ticksIO :: Int -> IO a -> IO ([Cycle], a)
ticksIO n a = do
    cs <- replicateM n (tickIO a)
    pure (fst <$> cs, last $ snd <$> cs)

-- | n measurements on each of a list of a's to be applied to f.
--
-- Currently memoizing it's ass off
--
-- > tickns n f [1,10,100,1000]
--
tickns :: Int -> (a -> b) -> [a] -> IO ([[Cycle]], [b])
tickns n f as = do
  cs <- sequence $ ticks n f <$> as
  pure (fst <$> cs, snd <$> cs)

-- | extra oomph for those hard to reach evaluations
force :: (NFData a) => a -> a
force x = x `deepseq` x

-- | a replicateM with good attributes
replicateM' :: Monad m => Int -> m a -> m [a]
replicateM' n op' = go n []
  where
    go 0 acc = return $ reverse acc
    go n' acc = do
      x <- op'
      go (n' - 1) (x : acc)

-- | average of a Cycle foldable
--
-- > cAv <- average <$> ticks n f a
--
average :: (Foldable f) => f Cycle -> Double
average = L.fold (L.premap fromIntegral ((/) <$> L.sum <*> L.genericLength))

-- | compute deciles
--
-- > c5 <- decile 5 <$> ticks n f a
--
deciles :: (Functor f, Foldable f) => Int -> f Cycle -> [Double]
deciles n xs =
  (\x -> fromMaybe 0 $ quantile x (tdigest (fromIntegral <$> xs) :: TDigest 25)) <$>
  ((/ fromIntegral n) . fromIntegral <$> [0 .. n]) :: [Double]

-- | compute a percentile
--
-- > c <- percentoile 0.4 <$> ticks n f a
--
percentile :: (Functor f, Foldable f) => Double -> f Cycle -> Double
percentile p xs = fromMaybe 0 $ quantile p (tdigest (fromIntegral <$> xs) :: TDigest 25)