{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RebindableSyntax #-}
{-# OPTIONS_GHC -Wall #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- | 'tick' uses the rdtsc chipset to measure time performance of a computation.
--
-- The measurement unit - a 'Cycle' - is one oscillation of the chip crystal as measured by the <https://en.wikipedia.org/wiki/Time_Stamp_Counter rdtsc> instruction which inspects the TSC register.
--
-- For reference, a computer with a frequency of 2 GHz means that one cycle is equivalent to 0.5 nanoseconds.
module Perf.Cycle
  ( -- $usage
    Cycle,
    tick_,
    warmup,
    tick,
    tick',
    tickIO,
    tickNoinline,
    ticks,
    ticksIO,
    ns,
    tickWHNF,
    tickWHNF',
    tickWHNFIO,
    ticksWHNF,
    ticksWHNFIO,
  )
where

import Control.DeepSeq (NFData (..), force)
import qualified Control.Foldl as L (fold, genericLength, premap, sum)
import Control.Monad (replicateM)
import Data.Foldable (toList)
import Data.Sequence (Seq (..))
import GHC.Word (Word64)
import System.CPUTime.Rdtsc
import Prelude

-- $setup
-- >>> import Perf.Cycle
-- >>> import Control.Monad
-- >>> import Data.Foldable (foldl')
-- >>> import qualified Control.Foldl as L
-- >>> let n = 1000
-- >>> let a = 1000
-- >>> let f x = foldl' (+) 0 [1 .. x]

-- $usage
-- >>> import Perf.Cycle
-- >>> import Control.Monad
-- >>> import Data.Foldable (foldl')
-- >>> let n = 1000
-- >>> let a = 1000
-- >>> let f x = foldl' (+) 0 [1 .. x]

-- | an unwrapped Word64
type Cycle = Word64

-- | tick_ measures the number of cycles it takes to read the rdtsc chip twice: the difference is then how long it took to read the clock the second time.
--
-- Below are indicative measurements using tick_:
--
-- >>> onetick <- tick_
-- >>> ticks' <- replicateM 10 tick_
-- >>> manyticks <- replicateM 1000000 tick_
-- >>> let average = L.fold ((/) <$> L.sum <*> L.genericLength)
-- >>> let avticks = average (fromIntegral <$> manyticks)
--
-- > one tick_: 78 cycles
-- > next 10: [20,18,20,20,20,20,18,16,20,20]
-- > average over 1m: 20.08 cycles
-- > 99.999% perc: 7,986
-- > 99.9% perc: 50.97
-- > 99th perc:  24.99
-- > 40th perc:  18.37
-- > [min, 10th, 20th, .. 90th, max]:
-- > 12.00 16.60 17.39 17.88 18.37 18.86 19.46 20.11 20.75 23.04 5.447e5
--
-- The distribution of tick_ measurements is highly skewed, with the maximum being around 50k cycles, which is of the order of a GC. The important point on the distribution is around the 30th to 50th percentile, where you get a clean measure, usually free of GC activity and cache miss-fires
tick_ :: IO Cycle
tick_ :: IO Cycle
tick_ = do
  Cycle
t <- IO Cycle
rdtsc
  Cycle
t' <- IO Cycle
rdtsc
  Cycle -> IO Cycle
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Cycle
t' Cycle -> Cycle -> Cycle
forall a. Num a => a -> a -> a
- Cycle
t)

-- | Warm up the register, to avoid a high first measurement. Without a warmup, one or more larger values can occur at the start of a measurement spree, and often are in the zone of an L2 miss.
--
-- >>> t <- tick_ -- first measure can be very high
-- >>> _ <- warmup 100
-- >>> t <- tick_ -- should be around 20 (3k for ghci)
warmup :: Int -> IO Double
warmup :: Int -> IO Double
warmup Int
n = do
  [Cycle]
ts <- Int -> IO Cycle -> IO [Cycle]
forall (m :: * -> *) a. Applicative m => Int -> m a -> m [a]
replicateM Int
n IO Cycle
tick_
  Double -> IO Double
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Double -> IO Double) -> Double -> IO Double
forall a b. (a -> b) -> a -> b
$ [Cycle] -> Double
forall a (f :: * -> *). (Integral a, Foldable f) => f a -> Double
average [Cycle]
ts

-- | tick where the arguments are lazy, so measurement may include evaluation of thunks that may constitute f and/or a
tick' :: (NFData b) => (a -> b) -> a -> IO (Cycle, b)
tick' :: (a -> b) -> a -> IO (Cycle, b)
tick' a -> b
f a
a = do
  !Cycle
t <- IO Cycle
rdtsc
  !b
a' <- b -> IO b
forall (f :: * -> *) a. Applicative f => a -> f a
pure (b -> b
forall a. NFData a => a -> a
force (b -> b) -> b -> b
forall a b. (a -> b) -> a -> b
$ a -> b
f a
a)
  !Cycle
t' <- IO Cycle
rdtsc
  (Cycle, b) -> IO (Cycle, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Cycle
t' Cycle -> Cycle -> Cycle
forall a. Num a => a -> a -> a
- Cycle
t, b
a')
{-# INLINE tick' #-}

-- | `tick f a` strictly evaluates f and a, then deeply evaluates f a, returning a (Cycle, f a)
--
-- >>> _ <- warmup 100
-- >>> (cs, _) <- tick f a
--
-- Note that feeding the same computation through tick twice may kick off sharing (aka memoization aka let floating).  Given the importance of sharing to GHC optimisations this is the intended behaviour.  If you want to turn this off then see -fno-full-laziness (and maybe -fno-cse).
tick :: (NFData b) => (a -> b) -> a -> IO (Cycle, b)
tick :: (a -> b) -> a -> IO (Cycle, b)
tick !a -> b
f !a
a = (a -> b) -> a -> IO (Cycle, b)
forall b a. NFData b => (a -> b) -> a -> IO (Cycle, b)
tick' a -> b
f a
a
{-# INLINE tick #-}

tickNoinline :: (NFData b) => (a -> b) -> a -> IO (Cycle, b)
tickNoinline :: (a -> b) -> a -> IO (Cycle, b)
tickNoinline !a -> b
f !a
a = (a -> b) -> a -> IO (Cycle, b)
forall b a. NFData b => (a -> b) -> a -> IO (Cycle, b)
tick' a -> b
f a
a
{-# NOINLINE tickNoinline #-}

-- | measures and deeply evaluates an `IO a`
--
-- >>> (cs, _) <- tickIO (pure (f a))
tickIO :: (NFData a) => IO a -> IO (Cycle, a)
tickIO :: IO a -> IO (Cycle, a)
tickIO IO a
a = do
  Cycle
t <- IO Cycle
rdtsc
  !a
a' <- a -> a
forall a. NFData a => a -> a
force (a -> a) -> IO a -> IO a
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> IO a
a
  Cycle
t' <- IO Cycle
rdtsc
  (Cycle, a) -> IO (Cycle, a)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Cycle
t' Cycle -> Cycle -> Cycle
forall a. Num a => a -> a -> a
- Cycle
t, a
a')

tickIONoinline :: (NFData a) => IO a -> IO (Cycle, a)
tickIONoinline :: IO a -> IO (Cycle, a)
tickIONoinline = IO a -> IO (Cycle, a)
forall a. NFData a => IO a -> IO (Cycle, a)
tickIO
{-# NOINLINE tickIONoinline #-}

-- | n measurements of a tick
--
-- returns a list of Cycles and the last evaluated f a
--
-- GHC is very good at finding ways to share computation, and anything measuring a computation multiple times is a prime candidate for aggresive ghc treatment. Internally, ticks uses a noinline pragma and a noinline version of to help reduce the chances of memoization, but this is an inexact science in the hands of he author, at least, so interpret with caution.
-- The use of noinline interposes an extra function call, which can highly skew very fast computations.
--
--
-- >>> let n = 1000
-- >>> (cs, fa) <- ticks n f a
--
-- Baseline speed can be highly sensitive to the nature of the function trimmings.  Polymorphic functions can tend to be slightly slower, and functions with lambda expressions can experience dramatic slowdowns.
--
-- > fMono :: Int -> Int
-- > fMono x = foldl' (+) 0 [1 .. x]
-- > fPoly :: (Enum b, Num b, Additive b) => b -> b
-- > fPoly x = foldl' (+) 0 [1 .. x]
-- > fLambda :: Int -> Int
-- > fLambda = \x -> foldl' (+) 0 [1 .. x]
ticks :: NFData b => Int -> (a -> b) -> a -> IO ([Cycle], b)
ticks :: Int -> (a -> b) -> a -> IO ([Cycle], b)
ticks Int
n0 a -> b
f a
a = (a -> b) -> a -> Int -> Seq Cycle -> IO ([Cycle], b)
forall t t t.
(Ord t, Num t) =>
t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go a -> b
f a
a Int
n0 Seq Cycle
forall a. Seq a
Empty
  where
    go :: t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go t
f' t
a' t
n Seq Cycle
ts
      | t
n t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 = ([Cycle], b) -> IO ([Cycle], b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Seq Cycle -> [Cycle]
forall (t :: * -> *) a. Foldable t => t a -> [a]
toList Seq Cycle
ts, a -> b
f a
a)
      | Bool
otherwise = do
        (Cycle
t, b
_) <- (a -> b) -> a -> IO (Cycle, b)
forall b a. NFData b => (a -> b) -> a -> IO (Cycle, b)
tickNoinline a -> b
f a
a
        t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go t
f' t
a' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1) (Seq Cycle
ts Seq Cycle -> Cycle -> Seq Cycle
forall a. Seq a -> a -> Seq a
:|> Cycle
t)
{-# NOINLINE ticks #-}

-- | n measuremenst of a tickIO
--
-- returns an IO tuple; list of Cycles and the last evaluated f a
--
-- >>> (cs, fa) <- ticksIO n (pure $ f a)
ticksIO :: (NFData a) => Int -> IO a -> IO ([Cycle], a)
ticksIO :: Int -> IO a -> IO ([Cycle], a)
ticksIO Int
n0 IO a
a = IO a -> Int -> Seq Cycle -> IO ([Cycle], a)
forall t a.
(Ord t, Num t, NFData a) =>
IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a Int
n0 Seq Cycle
forall a. Seq a
Empty
  where
    go :: IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a' t
n Seq Cycle
ts
      | t
n t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 = do
        a
a'' <- IO a
a'
        ([Cycle], a) -> IO ([Cycle], a)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Seq Cycle -> [Cycle]
forall (t :: * -> *) a. Foldable t => t a -> [a]
toList Seq Cycle
ts, a
a'')
      | Bool
otherwise = do
        (Cycle
t, a
_) <- IO a -> IO (Cycle, a)
forall a. NFData a => IO a -> IO (Cycle, a)
tickIONoinline IO a
a'
        IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1) (Seq Cycle
ts Seq Cycle -> Cycle -> Seq Cycle
forall a. Seq a -> a -> Seq a
:|> Cycle
t)
{-# NOINLINE ticksIO #-}

-- | make a series of measurements on a list of a's to be applied to f, for a tick function.
--
-- Tends to be fragile to sharing issues, but very useful to determine computation Order
--
-- > ns ticks n f [1,10,100,1000]
ns :: (a -> IO ([Cycle], b)) -> [a] -> IO ([[Cycle]], [b])
ns :: (a -> IO ([Cycle], b)) -> [a] -> IO ([[Cycle]], [b])
ns a -> IO ([Cycle], b)
t [a]
as = do
  [([Cycle], b)]
cs <- [IO ([Cycle], b)] -> IO [([Cycle], b)]
forall (t :: * -> *) (m :: * -> *) a.
(Traversable t, Monad m) =>
t (m a) -> m (t a)
sequence ([IO ([Cycle], b)] -> IO [([Cycle], b)])
-> [IO ([Cycle], b)] -> IO [([Cycle], b)]
forall a b. (a -> b) -> a -> b
$ a -> IO ([Cycle], b)
t (a -> IO ([Cycle], b)) -> [a] -> [IO ([Cycle], b)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a]
as
  ([[Cycle]], [b]) -> IO ([[Cycle]], [b])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (([Cycle], b) -> [Cycle]
forall a b. (a, b) -> a
fst (([Cycle], b) -> [Cycle]) -> [([Cycle], b)] -> [[Cycle]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [([Cycle], b)]
cs, ([Cycle], b) -> b
forall a b. (a, b) -> b
snd (([Cycle], b) -> b) -> [([Cycle], b)] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [([Cycle], b)]
cs)

-- | average of an Integral foldable
--
-- > cAv <- average <$> ticks n f a
average :: (Integral a, Foldable f) => f a -> Double
average :: f a -> Double
average = Fold a Double -> f a -> Double
forall (f :: * -> *) a b. Foldable f => Fold a b -> f a -> b
L.fold ((a -> Double) -> Fold Double Double -> Fold a Double
forall a b r. (a -> b) -> Fold b r -> Fold a r
L.premap a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Double -> Double -> Double
forall a. Fractional a => a -> a -> a
(/) (Double -> Double -> Double)
-> Fold Double Double -> Fold Double (Double -> Double)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Fold Double Double
forall a. Num a => Fold a a
L.sum Fold Double (Double -> Double)
-> Fold Double Double -> Fold Double Double
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Fold Double Double
forall b a. Num b => Fold a b
L.genericLength))

-- | WHNF versions
tickWHNF :: (a -> b) -> a -> IO (Cycle, b)
tickWHNF :: (a -> b) -> a -> IO (Cycle, b)
tickWHNF !a -> b
f !a
a = (a -> b) -> a -> IO (Cycle, b)
forall a b. (a -> b) -> a -> IO (Cycle, b)
tickWHNF' a -> b
f a
a

tickWHNFNoinline :: (a -> b) -> a -> IO (Cycle, b)
tickWHNFNoinline :: (a -> b) -> a -> IO (Cycle, b)
tickWHNFNoinline !a -> b
f !a
a = (a -> b) -> a -> IO (Cycle, b)
forall a b. (a -> b) -> a -> IO (Cycle, b)
tickWHNF' a -> b
f a
a
{-# NOINLINE tickWHNFNoinline #-}

-- | WHNF version
tickWHNF' :: (a -> b) -> a -> IO (Cycle, b)
tickWHNF' :: (a -> b) -> a -> IO (Cycle, b)
tickWHNF' a -> b
f a
a = do
  !Cycle
t <- IO Cycle
rdtsc
  !b
a' <- b -> IO b
forall (f :: * -> *) a. Applicative f => a -> f a
pure (a -> b
f a
a)
  !Cycle
t' <- IO Cycle
rdtsc
  (Cycle, b) -> IO (Cycle, b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Cycle
t' Cycle -> Cycle -> Cycle
forall a. Num a => a -> a -> a
- Cycle
t, b
a')

-- | WHNF version
tickWHNFIO :: IO a -> IO (Cycle, a)
tickWHNFIO :: IO a -> IO (Cycle, a)
tickWHNFIO IO a
a = do
  Cycle
t <- IO Cycle
rdtsc
  !a
a' <- IO a
a
  Cycle
t' <- IO Cycle
rdtsc
  (Cycle, a) -> IO (Cycle, a)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Cycle
t' Cycle -> Cycle -> Cycle
forall a. Num a => a -> a -> a
- Cycle
t, a
a')

tickWHNFIONoinline :: IO a -> IO (Cycle, a)
tickWHNFIONoinline :: IO a -> IO (Cycle, a)
tickWHNFIONoinline = IO a -> IO (Cycle, a)
forall a. IO a -> IO (Cycle, a)
tickWHNFIO
{-# NOINLINE tickWHNFIONoinline #-}

-- | WHNF version
ticksWHNF :: Int -> (a -> b) -> a -> IO ([Cycle], b)
ticksWHNF :: Int -> (a -> b) -> a -> IO ([Cycle], b)
ticksWHNF Int
n0 a -> b
f a
a = (a -> b) -> a -> Int -> Seq Cycle -> IO ([Cycle], b)
forall t t t.
(Ord t, Num t) =>
t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go a -> b
f a
a Int
n0 Seq Cycle
forall a. Seq a
Empty
  where
    go :: t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go t
f' t
a' t
n Seq Cycle
ts
      | t
n t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 = ([Cycle], b) -> IO ([Cycle], b)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Seq Cycle -> [Cycle]
forall (t :: * -> *) a. Foldable t => t a -> [a]
toList Seq Cycle
ts, a -> b
f a
a)
      | Bool
otherwise = do
        (Cycle
t, b
_) <- (a -> b) -> a -> IO (Cycle, b)
forall a b. (a -> b) -> a -> IO (Cycle, b)
tickWHNFNoinline a -> b
f a
a
        t -> t -> t -> Seq Cycle -> IO ([Cycle], b)
go t
f' t
a' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1) (Seq Cycle
ts Seq Cycle -> Cycle -> Seq Cycle
forall a. Seq a -> a -> Seq a
:|> Cycle
t)
{-# NOINLINE ticksWHNF #-}

-- | WHNF version
ticksWHNFIO :: Int -> IO a -> IO ([Cycle], a)
ticksWHNFIO :: Int -> IO a -> IO ([Cycle], a)
ticksWHNFIO Int
n0 IO a
a = IO a -> Int -> Seq Cycle -> IO ([Cycle], a)
forall t a.
(Ord t, Num t) =>
IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a Int
n0 Seq Cycle
forall a. Seq a
Empty
  where
    go :: IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a' t
n Seq Cycle
ts
      | t
n t -> t -> Bool
forall a. Ord a => a -> a -> Bool
<= t
0 = do
        a
a'' <- IO a
a'
        ([Cycle], a) -> IO ([Cycle], a)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Seq Cycle -> [Cycle]
forall (t :: * -> *) a. Foldable t => t a -> [a]
toList Seq Cycle
ts, a
a'')
      | Bool
otherwise = do
        (Cycle
t, a
_) <- IO a -> IO (Cycle, a)
forall a. IO a -> IO (Cycle, a)
tickWHNFIONoinline IO a
a'
        IO a -> t -> Seq Cycle -> IO ([Cycle], a)
go IO a
a' (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1) (Seq Cycle
ts Seq Cycle -> Cycle -> Seq Cycle
forall a. Seq a -> a -> Seq a
:|> Cycle
t)
{-# NOINLINE ticksWHNFIO #-}