{-# LANGUAGE TupleSections #-}
{-# LANGUAGE UndecidableInstances #-}

-- |
-- Module      :  Mcmc.Proposal.Hamiltonian
-- Description :  Hamiltonian Monte Carlo proposal
-- Copyright   :  (c) 2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Creation date: Mon Jul  5 12:59:42 2021.
--
-- The Hamiltonian Monte Carlo (HMC) proposal.
--
-- For references, see:
--
-- - [1] Chapter 5 of Handbook of Monte Carlo: Neal, R. M., MCMC Using
--   Hamiltonian Dynamics, In S. Brooks, A. Gelman, G. Jones, & X. Meng (Eds.),
--   Handbook of Markov Chain Monte Carlo (2011), CRC press.
--
-- - [2] Gelman, A., Carlin, J. B., Stern, H. S., & Rubin, D. B., Bayesian data
--   analysis (2014), CRC Press.
--
-- - [3] Review by Betancourt and notes: Betancourt, M., A conceptual
--   introduction to Hamiltonian Monte Carlo, arXiv, 1701–02434 (2017).
--
-- NOTE on implementation:
--
-- - The implementation assumes the existence of the 'Gradient'. Like so, the
--   user can use automatic or manual differentiation, depending on the problem
--   at hand.
--
-- - The Hamiltonian proposal acts on a vector of storable 'Values'. Functions
--   converting the state to and from this vector have to be provided. See
--   'HSettings'.
--
-- - The desired acceptance rate is 0.65, although the dimension of the proposal
--   is high.
--
-- - The speed of this proposal can change drastically when tuned because the
--   leapfrog trajectory length is changed.
--
-- - The Hamiltonian proposal is agnostic of the actual prior and likelihood
--   functions, and so, points with zero posterior probability cannot be
--   detected. This affects models with constrained parameters. See Gelman p.
--   303. This problem can be ameliorated by providing a 'Validate' function so
--   that the proposal can gracefully fail as soon as the state becomes invalid.
module Mcmc.Proposal.Hamiltonian
  ( Values,
    Gradient,
    Validate,
    Masses,
    LeapfrogTrajectoryLength,
    LeapfrogScalingFactor,
    HTuneLeapfrog (..),
    HTuneMasses (..),
    HTune (..),
    HSettings (..),
    hamiltonian,
  )
where

import qualified Data.Vector as VB
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Unboxed as VU
import Mcmc.Proposal
import qualified Numeric.LinearAlgebra as L
import Numeric.Log
import Numeric.MathFunctions.Constants
import qualified Statistics.Covariance as S
import qualified Statistics.Function as S
import qualified Statistics.Sample as S
import System.Random.MWC

-- TODO: No-U-turn sampler (NUTS). Ameliorates necessity to determine the
-- leapfrog trajectory length L. (I think this is a necessary extension.)

-- TODO: Riemannian adaptation: State-dependent mass matrix. (Seems a little bit
-- of an overkill.)

-- | The Hamiltonian proposal acts on a vector of floating point values.
type Values = L.Vector Double

-- | Gradient of the log posterior function.
--
-- The gradient has to be provided for the complete state. The reason is that
-- the gradient may change if parameters untouched by the Hamiltonian proposal
-- are altered by other proposals.
type Gradient a = a -> a

-- | Function validating the state.
--
-- Useful if parameters are constrained.
--
-- Also the validity of the state may depend on parameters untouched by the
-- Hamiltonian proposal.
type Validate a = a -> Bool

-- | Parameter mass matrix.
--
-- The masses roughly describe how reluctant the particles move through the
-- state space. If a parameter has higher mass, the momentum in this direction
-- will be changed less by the provided gradient, than when the same parameter
-- has lower mass. Off-diagonal entries describe the covariance structure. If
-- two parameters are negatively correlated, their generated initial momenta are
-- likely to have opposite signs.
--
-- The proposal is more efficient if masses are assigned according to the
-- inverse (co)-variance structure of the posterior function. That is,
-- parameters changing on larger scales should have lower masses than parameters
-- changing on lower scales. In particular, the optimal entries of the diagonal
-- of the mass matrix are the inverted variances of the parameters distributed
-- according to the posterior function.
--
-- Of course, the scales of the parameters of the posterior function are usually
-- unknown. Often, it is sufficient to
--
-- - set the diagonal entries of the mass matrix to identical values roughly
--   scaled with the inverted estimated average variance of the posterior
--   function; or even to
--
-- - set all diagonal entries of the mass matrix to 1.0, and all other entries
--   to 0.0, and trust the tuning algorithm (see 'HTune') to find the correct
--   values.
type Masses = L.Herm Double

-- | Mean leapfrog trajectory length \(L\).
--
-- Number of leapfrog steps per proposal.
--
-- To avoid problems with ergodicity, the actual number of leapfrog steps is
-- sampled per proposal from a discrete uniform distribution over the interval
-- \([\text{floor}(0.8L),\text{ceiling}(1.2L)]\).
--
-- For a discussion of ergodicity and reasons why randomization is important,
-- see [1] p. 15; also mentioned in [2] p. 304.
--
-- Usually set to 10, but larger values may be desirable.
--
-- NOTE: To avoid errors, the left bound has an additional hard minimum of 1,
-- and the right bound is required to be larger equal than the left bound.
--
-- NOTE: Call 'error' if value is less than 1.
type LeapfrogTrajectoryLength = Int

-- | Mean of leapfrog scaling factor \(\epsilon\).
--
-- Determines the size of each leapfrog step.
--
-- To avoid problems with ergodicity, the actual leapfrog scaling factor is
-- sampled per proposal from a continuous uniform distribution over the interval
-- \((0.8\epsilon,1.2\epsilon]\).
--
-- For a discussion of ergodicity and reasons why randomization is important,
-- see [1] p. 15; also mentioned in [2] p. 304.
--
-- Usually set such that \( L \epsilon = 1.0 \), but smaller values may be
-- required if acceptance rates are low.
--
-- NOTE: Call 'error' if value is zero or negative.
type LeapfrogScalingFactor = Double

-- Internal. Values; target state containing parameters.
type Positions = Values

-- Internal. Momenta of the parameters.
type Momenta = L.Vector Double

-- | Tune leapfrog parameters?
data HTuneLeapfrog
  = HNoTuneLeapfrog
  | -- | We expect that the larger the leapfrog scaling factor the lower the
    -- acceptance ratio. Consequently, if the acceptance rate is too low, the
    -- leapfrog scaling factor is decreased and vice versa. Further, the leapfrog
    -- trajectory length is scaled such that the product of the leapfrog scaling
    -- factor and leapfrog trajectory length stays roughly constant.
    HTuneLeapfrog
  deriving (Eq, Show)

-- | Tune masses?
--
-- The masses are tuned according to the (co)variances of the parameters
-- obtained from the posterior distribution over the last auto tuning interval.
data HTuneMasses
  = HNoTuneMasses
  | -- | Diagonal only: The variances of the parameters are calculated and the
    -- masses are amended using the old masses and the inverted variances. If, for
    -- a specific coordinate, the sample size is 60 or lower, or if the calculated
    -- variance is out of predefined bounds [1e-6, 1e6], the mass of the affected
    -- position is not changed.
    HTuneDiagonalMassesOnly
  | -- | All masses: The covariance matrix of the parameters is estimated and the
    -- inverted matrix (sometimes called precision matrix) is used as mass matrix.
    -- This procedure is error prone, but models with high correlations between
    -- parameters it is necessary to tune off-diagonal entries. The full mass
    -- matrix is only tuned if more than 200 samples are available. For these
    -- reasons, when tuning all masses it is recommended to use tuning settings
    -- such as
    --
    -- @
    -- BurnInWithCustomAutoTuning ([10, 20 .. 200] ++ replicate 5 500)
    -- @
    HTuneAllMasses
  deriving (Eq, Show)

-- | Tuning settings.
data HTune = HTune HTuneLeapfrog HTuneMasses
  deriving (Eq, Show)

-- | Specifications of the Hamilton Monte Carlo proposal.
data HSettings a = HSettings
  { -- | The sample state is used for error checks and to calculate the dimension
    -- of the proposal.
    hSample :: a,
    -- | Extract values to be manipulated by the Hamiltonian proposal from the
    -- state.
    hToVector :: a -> Values,
    -- | Put those values back into the state.
    hFromVectorWith :: a -> Values -> a,
    hGradient :: Gradient a,
    hMaybeValidate :: Maybe (Validate a),
    hMasses :: Masses,
    hLeapfrogTrajectoryLength :: LeapfrogTrajectoryLength,
    hLeapfrogScalingFactor :: LeapfrogScalingFactor,
    hTune :: HTune
  }

checkHSettings :: Eq a => HSettings a -> Maybe String
checkHSettings (HSettings x toVec fromVec _ _ masses l eps _)
  | any (<= 0) diagonalMasses = Just "checkHSettings: Some diagonal entries of the mass matrix are zero or negative."
  | nrows /= ncols = Just "checkHSettings: Mass matrix is not square."
  | fromVec x xVec /= x = Just "checkHSettings: 'fromVectorWith x (toVector x) /= x' for sample state."
  | L.size xVec /= nrows = Just "checkHSettings: Mass matrix has different size than 'toVector x', where x is sample state."
  | l < 1 = Just "checkHSettings: Leapfrog trajectory length is zero or negative."
  | eps <= 0 = Just "checkHSettings: Leapfrog scaling factor is zero or negative."
  | otherwise = Nothing
  where
    ms = L.unSym masses
    diagonalMasses = L.toList $ L.takeDiag ms
    nrows = L.rows ms
    ncols = L.cols ms
    xVec = toVec x

-- Internal. Mean vector containing zeroes.
type HMu = L.Vector Double

-- Internal. Symmetric, inverted mass matrix.
type HMassesInv = L.Herm Double

-- Internal. Symmetric, inverted mass matrix scaled with the leapfrog step size
-- epsilon.
type HMassesInvEps = L.Herm Double

-- Internal. Logarithm of the determinant of the mass matrix.
type HLogDetMasses = Double

-- Internal data type containing memoized values.
data HData = HData
  { _hMu :: HMu,
    _hMassesInv :: HMassesInv,
    _hMassesInvEps :: HMassesInvEps,
    _hLogDetMasses :: HLogDetMasses
  }

-- Call 'error' if the determinant of the covariance matrix is negative.
getHData :: HSettings a -> HData
getHData s =
  -- The multivariate normal distribution requires a positive definite matrix
  -- with positive determinant.
  if sign == 1.0
    then HData mu massesInvH massesInvEpsH logDetMasses
    else
      let msg =
            "hamiltonianSimple: Determinant of covariance matrix is negative."
              <> " The logarithm of the absolute value of the determinant is: "
              <> show logDetMasses
              <> "."
       in error msg
  where
    ms = hMasses s
    nrows = L.rows $ L.unSym ms
    mu = L.fromList $ replicate nrows 0.0
    (massesInv, (logDetMasses, sign)) = L.invlndet $ L.unSym ms
    -- In theory we can trust that the matrix is symmetric here, because the
    -- inverse of a symmetric matrix is symmetric. However, one may want to
    -- implement a check anyways.
    massesInvH = L.trustSym massesInv
    eps = hLeapfrogScalingFactor s
    massesInvEpsH = L.scale eps massesInvH

generateMomenta ::
  -- Provided so that it does not have to be recreated.
  HMu ->
  Masses ->
  GenIO ->
  IO Momenta
generateMomenta mu masses gen = do
  seed <- uniformM gen :: IO Int
  let momenta = L.gaussianSample seed 1 mu masses
  return $ L.flatten momenta

-- Prior distribution of momenta.
--
-- Log of density of multivariate normal distribution with given parameters.
-- https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Density_function.
logDensityMultivariateNormal ::
  -- Mean vector.
  L.Vector Double ->
  -- Inverted covariance matrix.
  L.Herm Double ->
  -- Logarithm of the determinant of the covariance matrix.
  Double ->
  -- Value vector.
  L.Vector Double ->
  Log Double
logDensityMultivariateNormal mu sigmaInvH logDetSigma xs =
  Exp $ c + (-0.5) * (logDetSigma + ((dxs L.<# sigmaInv) L.<.> dxs))
  where
    dxs = xs - mu
    k = fromIntegral $ L.size mu
    c = negate $ m_ln_sqrt_2_pi * k
    sigmaInv = L.unSym sigmaInvH

leapfrog ::
  Gradient Positions ->
  Maybe (Validate Positions) ->
  HMassesInvEps ->
  LeapfrogTrajectoryLength ->
  LeapfrogScalingFactor ->
  Positions ->
  Momenta ->
  -- Maybe (Positions', Momenta'); fail if state is not valid.
  Maybe (Positions, Momenta)
leapfrog grad mVal hMassesInvEps l eps theta phi = do
  let -- The first half step of the momenta.
      phiHalf = leapfrogStepMomenta (0.5 * eps) grad theta phi
  -- L-1 full steps. This gives the positions theta_{L-1}, and the momenta
  -- phi_{L-1/2}.
  (thetaLM1, phiLM1Half) <- go (l - 1) (Just (theta, phiHalf))
  -- The last full step of the positions.
  thetaL <- valF $ leapfrogStepPositions hMassesInvEps thetaLM1 phiLM1Half
  let -- The last half step of the momenta.
      phiL = leapfrogStepMomenta (0.5 * eps) grad thetaL phiLM1Half
  return (thetaL, phiL)
  where
    valF x = case mVal of
      Nothing -> Just x
      Just f -> if f x then Just x else Nothing
    go _ Nothing = Nothing
    go 0 (Just (t, p)) = Just (t, p)
    go n (Just (t, p)) =
      let t' = leapfrogStepPositions hMassesInvEps t p
          p' = leapfrogStepMomenta eps grad t' p
          r = (,p') <$> valF t'
       in go (n - 1) r

leapfrogStepMomenta ::
  -- Step size (usually a multiple of the leapfrog scaling factor).
  Double ->
  Gradient Positions ->
  -- Current positions.
  Positions ->
  -- Current momenta.
  Momenta ->
  -- New momenta.
  Momenta
leapfrogStepMomenta eps grad theta phi = phi + L.scale eps (grad theta)

leapfrogStepPositions ::
  HMassesInvEps ->
  -- Current positions.
  Positions ->
  -- Current momenta.
  Momenta ->
  -- New positions.
  Positions
leapfrogStepPositions hMassesInvEps theta phi = theta + (L.unSym hMassesInvEps L.#> phi)

massesToTuningParameters :: Masses -> AuxiliaryTuningParameters
massesToTuningParameters = VB.convert . L.flatten . L.unSym

tuningParametersToMasses ::
  -- Dimension of the mass matrix.
  Int ->
  AuxiliaryTuningParameters ->
  Masses
tuningParametersToMasses d = L.trustSym . L.reshape d . VB.convert

hTuningParametersToSettings ::
  HSettings a ->
  TuningParameter ->
  AuxiliaryTuningParameters ->
  Either String (HSettings a)
hTuningParametersToSettings s t ts
  | nTsNotOK =
      Left "hTuningParametersToSettings: Auxiliary variables do not have correct dimension."
  | otherwise =
      Right $
        s
          { hMasses = msTuned,
            hLeapfrogTrajectoryLength = lTuned,
            hLeapfrogScalingFactor = eTuned
          }
  where
    ms = hMasses s
    d = L.rows $ L.unSym ms
    l = hLeapfrogTrajectoryLength s
    e = hLeapfrogScalingFactor s
    (HTune tlf tms) = hTune s
    nTsNotOK =
      let nTs = VU.length ts
       in case tms of
            HNoTuneMasses -> nTs /= 0
            _ -> nTs /= d * d
    msTuned = case tms of
      HNoTuneMasses -> ms
      _ -> tuningParametersToMasses d ts
    -- The larger epsilon, the larger the proposal step size and the lower the
    -- expected acceptance ratio.
    --
    -- Further, we roughly keep \( L * \epsilon = 1.0 \). The equation is not
    -- correct, because we pull L closer to the original value to keep the
    -- runtime somewhat acceptable.
    (lTuned, eTuned) = case tlf of
      HNoTuneLeapfrog -> (l, e)
      HTuneLeapfrog -> (ceiling $ fromIntegral l / (t ** 0.9) :: Int, t * e)

hamiltonianSimpleWithTuningParameters ::
  HSettings a ->
  TuningParameter ->
  AuxiliaryTuningParameters ->
  Either String (ProposalSimple a)
hamiltonianSimpleWithTuningParameters s t ts =
  hamiltonianSimple <$> hTuningParametersToSettings s t ts

-- The inverted covariance matrix and the log determinant of the covariance
-- matrix are calculated by 'hamiltonianSimple'.
hamiltonianSimpleWithMemoizedCovariance ::
  HSettings a ->
  HData ->
  ProposalSimple a
hamiltonianSimpleWithMemoizedCovariance st dt x g = do
  phi <- generateMomenta mu masses g
  lRan <- uniformR (lL, lR) g
  eRan <- uniformR (eL, eR) g
  case leapfrog gradientVec mValVec massesInvEps lRan eRan theta phi of
    Nothing -> return (x, 0.0, 1.0)
    Just (theta', phi') ->
      let -- Prior of momenta.
          prPhi = logDensityMultivariateNormal mu massesInv logDetMasses phi
          -- NOTE: Neal page 12: In order for the proposal to be in detailed
          -- balance, the momenta have to be negated before proposing the new
          -- value. This is not required here since the prior involves a
          -- multivariate normal distribution with means 0.
          prPhi' = logDensityMultivariateNormal mu massesInv logDetMasses phi'
          kernelR = prPhi' / prPhi
       in return (fromVec x theta', kernelR, 1.0)
  where
    (HSettings _ toVec fromVec gradient mVal masses l e _) = st
    theta = toVec x
    lL = maximum [1 :: Int, floor $ (0.8 :: Double) * fromIntegral l]
    lR = maximum [lL, ceiling $ (1.2 :: Double) * fromIntegral l]
    eL = 0.8 * e
    eR = 1.2 * e
    (HData mu massesInv massesInvEps logDetMasses) = dt
    -- Vectorize the gradient and validation functions.
    gradientVec = toVec . gradient . fromVec x
    mValVec = mVal >>= (\f -> return $ f . fromVec x)

hamiltonianSimple ::
  HSettings a ->
  ProposalSimple a
hamiltonianSimple s = hamiltonianSimpleWithMemoizedCovariance s hd
  where
    hd = getHData s

-- If changed, also change help text of 'HTuneMasses'.
massMin :: Double
massMin = 1e-6

-- If changed, also change help text of 'HTuneMasses'.
massMax :: Double
massMax = 1e6

-- Minimal number of unique samples required for tuning the diagonal entries of
-- the mass matrix.
--
-- If changed, also change help text of 'HTuneMasses'.
samplesMinDiagonal :: Int
samplesMinDiagonal = 61

-- Minimal number of samples required for tuning all entries of the mass matrix.
--
-- If changed, also change help text of 'HTuneMasses'.
samplesMinAll :: Int
samplesMinAll = 201

getSampleSize :: VS.Vector Double -> Int
getSampleSize = VS.length . VS.uniq . S.gsort

-- Diagonal elements are variances which are strictly positive.
getNewMassDiagonalWithRescue :: Int -> Double -> Double -> Double
getNewMassDiagonalWithRescue sampleSize massOld massEstimate
  | sampleSize < samplesMinDiagonal = massOld
  -- NaN and negative masses could be errors.
  | isNaN massEstimate = massOld
  | massEstimate <= 0 = massOld
  | massMin > massNew = massMin
  | massNew > massMax = massMax
  | otherwise = massNew
  where
    massNewSqrt = recip 3 * (sqrt massOld + 2 * sqrt massEstimate)
    massNew = massNewSqrt ** 2

-- NOTE: Here, we lose time because we convert the states to vectors again,
-- something that has already been done. But then, auto tuning is not a runtime
-- determining factor.
tuneDiagonalMassesOnly ::
  Int ->
  (a -> Positions) ->
  AuxiliaryTuningFunction a
tuneDiagonalMassesOnly dim toVec xs ts
  -- If not enough data is available, do not tune.
  | VB.length xs < samplesMinDiagonal = ts
  | otherwise =
      -- Replace the diagonal.
      massesToTuningParameters $
        L.trustSym $ massesOld - L.diag massesDiagonalOld + L.diag massesDiagonalNew
  where
    -- xs: Each vector entry contains all parameter values of one iteration.
    -- xs': Each row contains all parameter values of one iteration.
    xs' = L.fromRows $ VB.toList $ VB.map toVec xs
    sampleSizes = VS.fromList $ map getSampleSize $ L.toColumns xs'
    massesOld = L.unSym $ tuningParametersToMasses dim ts
    massesDiagonalOld = L.takeDiag massesOld
    massesDiagonalEstimate = VS.fromList $ map (recip . S.variance) $ L.toColumns xs'
    massesDiagonalNew =
      VS.zipWith3
        getNewMassDiagonalWithRescue
        sampleSizes
        massesDiagonalOld
        massesDiagonalEstimate

-- NOTE: Here, we lose time because we convert the states to vectors again,
-- something that has already been done. But then, auto tuning is not a runtime
-- determining factor.
tuneAllMasses ::
  Int ->
  (a -> Positions) ->
  AuxiliaryTuningFunction a
tuneAllMasses dim toVec xs ts
  -- If not enough data is available, do not tune.
  | VB.length xs < samplesMinDiagonal = ts
  -- If not enough data is available, only the diagonal masses are tuned.
  | VB.length xs < samplesMinAll = fallbackDiagonal
  | L.rank xs' /= dim = fallbackDiagonal
  | otherwise = massesToTuningParameters $ L.trustSym massesNew
  where
    fallbackDiagonal = tuneDiagonalMassesOnly dim toVec xs ts
    -- xs: Each vector entry contains all parameter values of one iteration.
    -- xs': Each row contains all parameter values of one iteration.
    xs' = L.fromRows $ VB.toList $ VB.map toVec xs
    (_, ss, xsNormalized) = S.scale xs'
    -- sigmaNormalized = L.unSym $ either error id $ S.oracleApproximatingShrinkage xsNormalized
    sigmaNormalized = L.unSym $ either error fst $ S.graphicalLasso 0.5 xsNormalized
    sigma = S.rescaleSWith ss sigmaNormalized
    massesNew = L.inv sigma

-- | Hamiltonian Monte Carlo proposal.
hamiltonian ::
  Eq a =>
  HSettings a ->
  PName ->
  PWeight ->
  Proposal a
hamiltonian s n w = case checkHSettings s of
  Just err -> error err
  Nothing ->
    let desc = PDescription "Hamiltonian Monte Carlo (HMC)"
        toVec = hToVector s
        dim = (L.size $ toVec $ hSample s)
        pDim = PSpecial dim 0.65
        ts = massesToTuningParameters (hMasses s)
        ps = hamiltonianSimple s
        hamiltonianWith = Proposal n desc PSlow pDim w ps
        tSet@(HTune tlf tms) = hTune s
        tFun = case tlf of
          HNoTuneLeapfrog -> noTuningFunction
          HTuneLeapfrog -> defaultTuningFunctionWith pDim
        tFunAux = case tms of
          HNoTuneMasses -> noAuxiliaryTuningFunction
          HTuneDiagonalMassesOnly -> tuneDiagonalMassesOnly dim toVec
          HTuneAllMasses -> tuneAllMasses dim toVec
     in case tSet of
          (HTune HNoTuneLeapfrog HNoTuneMasses) -> hamiltonianWith Nothing
          _ ->
            let tuner = Tuner 1.0 tFun ts tFunAux (hamiltonianSimpleWithTuningParameters s)
             in hamiltonianWith $ Just tuner