-- |
-- Module      : ConClusion.Numeric.Statistics
-- Description : Statistical Functions
-- Copyright   : Phillip Seeber, 2021
-- License     : AGPL-3
-- Maintainer  : phillip.seeber@googlemail.com
-- Stability   : experimental
-- Portability : POSIX, Windows
module ConClusion.Numeric.Statistics
  ( -- * PCA
    PCA (..),
    pca,

    -- * Variance
    normalise,
    meanDeviation,
    covariance,

    -- * Distance Metrics
    DistFn,
    lpNorm,
    manhattan,
    euclidean,
    mahalanobis,

    -- * Cluster Algorithms
    Clusters,

    -- ** DBScan
    DistanceInvalidException (..),
    dbscan,

    -- ** Hierarchical Cluster Analysis
    Dendrogram,
    JoinStrat (..),
    hca,
    cutDendroAt,
  )
where

import ConClusion.Numeric.Data hiding (normalise)
import Data.Aeson hiding (Array)
import Data.Complex
import qualified Data.IntSet as IntSet
import Data.Massiv.Array as Massiv
import Data.Massiv.Array.Unsafe as Massiv
import qualified Data.PSQueue as PQ
import qualified Numeric.LinearAlgebra as LA
import RIO hiding (Vector)
import System.IO.Unsafe (unsafePerformIO)

----------------------------------------------------------------------------------------------------
-- Others/Helpers

-- | Solves eigenvalue problem of a square matrix and obtains its eigenvalues and eigenvectors.
{-# SCC eig #-}
eig ::
  ( Mutable r1 Ix1 (Complex Double),
    Mutable r2 Ix1 (Complex Double),
    LA.Field e,
    Manifest r3 Ix1 e,
    Resize r3 Ix2,
    Load r3 Ix2 e,
    MonadThrow m
  ) =>
  Matrix r3 e ->
  m (Vector r1 (Complex Double), Matrix r2 (Complex Double))
eig covM
  | m /= n = throwM $ IndexException "eigenvalue problems can only be solved for square matrix"
  | otherwise = return . bimap vecH2M matH2M . LA.eig $ cov
  where
    Sz (m :. n) = size covM
    cov = matM2H covM

-- | Sort eigenvalues and eigenvectors by magnitude of the eigenvalues in descending order (largest
-- eigenvalues first). Eigenvectors are the columns of the input matrix.
{-# SCC eigSort #-}
eigSort ::
  ( Load r2 Ix2 e,
    MonadThrow m,
    Source r1 Ix1 e,
    Source r2 Ix2 e,
    Mutable r1 Ix1 e,
    Mutable r2 Ix2 e,
    Unbox e,
    Ord e
  ) =>
  (Vector r1 e, Matrix r2 e) ->
  m (Vector r1 e, Matrix r2 e)
eigSort (vec, mat)
  | m /= n = throwM $ IndexException "matrix of the eigenvectors is not a square matrix"
  | n /= n' = throwM $ IndexException "different number of eigenvalues and eigenvectors"
  | otherwise = do
    let ixedEigenvalues = Massiv.zip vec ixVec
        (eigenValSortAsc, ixSort) = (\a -> (get fst a, get snd a)) . quicksort . compute @U $ ixedEigenvalues
        eigenVecSortAsc = backpermute' (Sz $ m :. n) (\(r :. c) -> r :. (ixSort ! c)) mat
        eigenValSort = reverse' (Dim 1) eigenValSortAsc
        eigenVecSort = reverse' (Dim 1) eigenVecSortAsc
    return (compute eigenValSort, compute eigenVecSort)
  where
    Sz (m :. n) = size mat
    Sz n' = size vec
    ixVec = makeArrayLinear @D Seq (Sz n') id
    get acc = compute @U . Massiv.map acc

----------------------------------------------------------------------------------------------------
-- Principal Component Analysis

data PCA = PCA
  { -- | Original feature matrix.
    x :: Matrix U Double,
    -- | Feature matrix in mean deviation form.
    x' :: Matrix U Double,
    -- | Transformed data.
    y :: Matrix U Double,
    -- | Transformation matrix to transform feature matrix into PCA result matrix.
    a :: Matrix U Double,
    -- | Mean squared error introduced by PCA.
    mse :: Double,
    -- | Percentage of the behaviour captured in the remaining dimensions.
    remaining :: Double,
    -- | All eigenvalues from the diagonalisation of the covariance matrix.
    allEigenValues :: Vector U Double,
    -- | Eigenvalues that were kept for PCA.
    pcaEigenValues :: Vector U Double,
    -- | All eigenvectors from the diagonalisation of the covariance matrix.
    allEigenVecs :: Matrix U Double,
    -- | Eigenvectors that were kept for PCA.
    pcaEigenVecs :: Matrix U Double
  }

-- | Transform the input values with a transformation matrix \(\mathbf{A}\), where \(\mathbf{A}\) is
-- constructed from the eigenvectors associated to the largest eigenvalues.
{-# SCC transformToPCABasis #-}
transformToPCABasis ::
  ( Source (R r) Ix2 e,
    Extract r Ix2 e,
    Mutable r Ix2 e,
    Numeric r e,
    MonadThrow m
  ) =>
  -- | Number of dimensions to keep from PCA.
  Int ->
  -- | Matrix of the eigenvectors, sorted descendingly by eigenvalues, where the eigenvectors are
  -- the columns of the matrix.
  Matrix r e ->
  -- | Feature matrix in mean deviation form.
  Matrix r e ->
  -- | Input data transformed by PCA to lower dimensions, and the transformation matrix
  -- \(\mathbf{A}\).
  m (Matrix r e, Matrix r e)
transformToPCABasis nDim eigenVecMat featureMat
  | mE /= nE = throwM $ IndexException "the matrix of the eigenvectors must be a quadratic matrix"
  | nDim <= 0 = throwM $ IndexException "the number of dimensions of the PCA is smaller than or zero"
  | nDim >= nE = throwM $ IndexException "more than the possible amount of dimensions has been selected"
  | mE /= mF = throwM $ IndexException "eigenvector matrix and feature matrix have mismatching dimensions"
  | otherwise = do
    matA <- compute . transpose <$> extractM (0 :. 0) (Sz $ mE :. nDim) eigenVecMat
    pcaData <- matA .><. featureMat
    return (pcaData, matA)
  where
    Sz (mE :. nE) = size eigenVecMat
    Sz (mF :. _nF) = size featureMat

-- | Performs a PCA on the feature matrix \(\mathbf{X}\) by solving the eigenproblem of the
-- covariance matrix. The function takes the feature matrix directly and perfoms the conversion
-- to mean deviation form, the calculation of the covariance matrix and the eigenvalue problem
-- automatically.
{-# SCC pca #-}
pca ::
  ( Numeric r Double,
    Mutable r Ix2 Double,
    Manifest r Ix1 Double,
    Source (R r) Ix2 Double,
    Extract r Ix2 Double,
    MonadThrow m
  ) =>
  -- | Dimensionalty after PCA transformation.
  Int ->
  -- | \(m \times n\) Feaute matrix \(\mathbf{X}\), with \(m\) different measurements (rows) in
  -- \(n\) different trials (columns).
  Matrix r Double ->
  m PCA
pca dim x = do
  -- Calculate the mean deviation form of the feature matrix and the covariance matrix from it.
  let x' = normalise . meanDeviation $ x
      cov = covariance x'

  -- Obtain eigenvalues and eigenvectors of the covariance matrix and sort them.
  (eigValsC :: Vector U (Complex Double), eigVecsC :: Matrix U (Complex Double)) <- eig cov
  let eigValsR = compute @U . Massiv.map realPart $ eigValsC
      eigVecsR = compute . Massiv.map realPart $ eigVecsC
  (eValS, eVecS) <- eigSort (eigValsR, eigVecsR)

  -- Use the subset of the eigenvectors with the largest eigenvalues to transform the features in
  -- mean deviation form into the result matrix Y.
  (pcaData, matA) <- transformToPCABasis dim eVecS x'

  -- Reconstuct the original data from lower dimensions and calculate the mean squared deviation.
  reconstructX <- (compute . transpose $ matA) .><. pcaData
  mse <- (/ fromIntegral n) . Massiv.sum . Massiv.map (** 2) <$> (x' .-. reconstructX)

  -- For output give the eigenvalues and eigenvectors that were kept.
  pcaEigenValues <- extractM 0 (Sz dim) eValS
  pcaEigenVecs <- extractM (0 :. 0) (Sz $ m :. dim) eVecS

  -- Calculate the amount of behaviour that could be kept.
  let remaining = (Massiv.sum pcaEigenValues / Massiv.sum eValS) * 100

  return
    PCA
      { x = compute x,
        x' = compute x',
        y = compute pcaData,
        a = compute matA,
        mse = mse,
        remaining = remaining,
        allEigenValues = eValS,
        pcaEigenValues = compute pcaEigenValues,
        allEigenVecs = compute eVecS,
        pcaEigenVecs = compute pcaEigenVecs
      }
  where
    Sz (m :. n) = size x

----------------------------------------------------------------------------------------------------
-- Variance

-- | Subtract the mean value of all columns from the feature matrix. Brings the feature matrix to
-- mean deviation form.
{-# SCC meanDeviation #-}
meanDeviation ::
  ( Source r Ix2 e,
    Fractional e,
    Unbox e,
    Numeric r e,
    Mutable r Ix2 e
  ) =>
  Matrix r e ->
  Matrix r e
meanDeviation mat = mat !-! compute meanMat
  where
    Sz (_ :. n) = Massiv.size mat
    featueMean = Massiv.foldlInner (+) 0 mat .* (1 / fromIntegral n)
    meanMat = expandInner (Sz n) const . compute @U $ featueMean

-- | Obtains the covariance matrix \(\mathbf{C_X}\) from the feature matrix \(\mathbf{X}\).
-- \[
--   \mathbf{C_X} \equiv \frac{1}{n - 1} \mathbf{X} \mathbf{X}^T
-- \]
-- where \(n\) is the number of columns in the matrix.
--
-- The feature matrix should be in mean deviation form, see 'meanDeviation'.
{-# SCC covariance #-}
covariance :: (Numeric r e, Mutable r Ix2 e, Fractional e) => Matrix r e -> Matrix r e
covariance x = (1 / (fromIntegral n - 1)) *. (x !><! (compute . transpose $ x))
  where
    Sz (_ :. n) = size x

-- | Normalise each value so that the maximum absolute value in each row becomes one.
normalise ::
  ( Ord e,
    Unbox e,
    Numeric r e,
    Fractional e,
    Source r Ix2 e,
    Mutable r Ix2 e
  ) =>
  Array r Ix2 e ->
  Array r Ix2 e
normalise mat =
  let absMat = Massiv.map abs mat
      maxPerRow = compute @U . foldlInner max 0 $ absMat
      divMat = compute . Massiv.map (1 /) . expandInner @Ix2 (Sz n) const $ maxPerRow
   in divMat !*! mat
  where
    Sz (_ :. n) = size mat

----------------------------------------------------------------------------------------------------
-- Distance Measures

-- | Distance matrix generator functions.
type DistFn r e = Matrix r e -> Matrix r e

-- | Builds the distance measures in a permutation matrix/distance matrix.
buildDistMat ::
  (Mutable r Ix2 e) =>
  -- | Zip function to combine the elements of vectors \(\mathbf{a}\) \(\mathbf{b}\). Usually @(-)@.
  -- \( f(\mathbf{a}_i, \mathbf{b}_i) = \mathbf{c} \)
  (e -> e -> a) ->
  -- | Fold the vector \(\mathbf{c}\) elementwise to a distance \(d\).
  (a -> a -> a) ->
  -- | Accumulator of the fold function.
  a ->
  -- | \(m \times n\) matrix, with \(n\) \(m\)-dimensional points (column vectors of the matrix).
  Matrix r e ->
  -- | Resulting distance matrix.
  Matrix D a
buildDistMat zipFn foldFn acc mat =
  let a = transposeOuter . expandOuter @Ix3 (Sz n) const $ mat
      b = transposeInner a
      ab = Massiv.zipWith zipFn a b
      d = foldlInner foldFn acc ab
   in d
  where
    Sz (_ :. n) = size mat

-- | The \(L_p\) norm between two vectors. Generalisation of Manhattan and Euclidean distances.
-- \[
--   d(\mathbf{a}, \mathbf{b}) = \left( \sum \limits_{i=1}^n \lvert \mathbf{a}_i - \mathbf{b}_i \rvert ^p \right) ^ \frac{1}{p}
-- \]
{-# SCC lpNorm #-}
lpNorm :: (Mutable r Ix2 e, Floating e) => Int -> DistFn r e
lpNorm p = compute . buildDistMat zipFn foldFn 0
  where
    zipFn a b = (^ p) . abs $ a - b
    foldFn a b = (** (1 / fromIntegral p)) $ a + b

-- | The Manhattan distance between two vectors. Specialisation of the \(L_p\) norm for \(p = 1\).
-- \[
--   d(\mathbf{a}, \mathbf{b}) = \sum \limits_{i=1}^n \lvert \mathbf{a}_i - \mathbf{b}_i \rvert
-- \]
{-# SCC manhattan #-}
manhattan :: (Mutable r Ix2 e, Floating e) => DistFn r e
manhattan = lpNorm 1

-- | The Euclidean distance between two vectors. Specialisation of the \(L_p\) norm for \(p = 2\).
-- \[
--   d(\mathbf{a}, \mathbf{b}) = \sqrt{\sum \limits_{i=1}^n (\mathbf{a}_i - \mathbf{b}_i)^2}
-- \]
{-# SCC euclidean #-}
euclidean :: (Mutable r Ix2 e, Floating e) => DistFn r e
euclidean = lpNorm 2

-- | Mahalanobis distance between points. Suitable for non correlated axes.
-- \[
--   d(\mathbf{a}, \mathbf{b}) = \sqrt{(\mathbf{a} - \mathbf{b})^T \mathbf{S}^{-1} (\mathbf{a} - \mathbf{b})}
-- \]
-- where \(\mathbf{S}\) is the covariance matrix.
{-# SCC mahalanobis #-}
mahalanobis :: (Unbox e, Numeric r e, Mutable r Ix2 e, Mutable r Ix1 e, LA.Field e) => DistFn r e
mahalanobis points =
  let a = transposeOuter . expandOuter @Ix3 (Sz n) const $ points
      b = transposeInner a
      abDiff = compute @U $ a !-! b
      ixArray = makeArray @U @Ix2 @Ix2 Par (Sz $ n :. n) id
      distMat =
        Massiv.map
          ( \(x :. y) ->
              let ab = compute @U $ abDiff !> x !> y
               in ab ><! covInv !.! ab
          )
          ixArray
   in compute . Massiv.map sqrt $ distMat
  where
    Sz (_ :. n) = size points
    cov = covariance . meanDeviation $ points
    covInv = matH2M . LA.inv . matM2H $ cov

----------------------------------------------------------------------------------------------------
-- DBScan

-- | Exception for invalid search distances.
newtype DistanceInvalidException e = DistanceInvalidException e deriving (Show, Eq)

instance (Typeable e, Show e) => Exception (DistanceInvalidException e)

-- | Representation of clusters.
type Clusters = Vector B IntSet

-- | DBScan algorithm.
{-# SCC dbscan #-}
dbscan ::
  ( MonadThrow m,
    Ord e,
    Num e,
    Typeable e,
    Show e,
    Source r Ix2 e
  ) =>
  -- | Distance measure to build the distance matrix of all points.
  DistFn r e ->
  -- | Minimal number of members in a cluster.
  Int ->
  -- | Search radius \(\epsilon\)
  e ->
  -- | \(n\) \(m\)-dimensional data points as column vectors of a \(m \times n\) matrix.
  Matrix r e ->
  -- | Resulting clusters.
  m Clusters
dbscan distFn nPoints epsilon points
  | isEmpty points = throwM $ SizeEmptyException (Sz 0 :: Sz1)
  | nPoints < 1 = throwM $ SizeNegativeException (Sz nPoints)
  | epsilon <= 0 = throwM $ DistanceInvalidException epsilon
  | otherwise =
    let pointNeighbours = ifoldlInner collectNeighbours mempty distMat
        allClusters = joinOverlapping . compute @B $ pointNeighbours
        largeClusters = sfilter (\s -> IntSet.size s >= nPoints) allClusters
     in return $ compute largeClusters
  where
    -- The distance matrix in the measure chosen by the distance function.
    distMat = distFn points

    -- Function to collect the neighbours of a point within the search radius epsilon.
    {-# SCC collectNeighbours #-}
    collectNeighbours (_ :. n) acc d = if d <= epsilon then IntSet.insert n acc else acc

    -- Construct the overlap matrix of all clusters.
    compareSets :: (IntSet -> IntSet -> Bool) -> Vector B IntSet -> Matrix D Bool
    compareSets fn clVec =
      let a = expandOuter @Ix2 sz const clVec
          b = transpose a
          compareMat = Massiv.zipWith fn a b
       in compareMat
      where
        sz = size clVec

    -- Overlap matrix. Checks if two sets have any overlap. Sets do overlap with themself.
    overlap :: Vector B IntSet -> Matrix D Bool
    overlap = compareSets (\a b -> not $ IntSet.disjoint a b)

    -- Check if any set overlaps wiht **any** other set.
    anyOtherOverlap :: Vector B IntSet -> Bool
    anyOtherOverlap = Massiv.any (== True) . imap (\(m :. n) v -> if m == n then False else v) . overlap

    -- Check if two sets are identical. Sets are identical to themself.
    same :: Vector B IntSet -> Matrix D Bool
    same = compareSets (==)

    -- Join all overlapping clusters recursively.
    {-# SCC joinOverlapping #-}
    joinOverlapping :: Vector B IntSet -> Vector B IntSet
    joinOverlapping clVec =
      let -- The overlap matrix of the clusters.
          ovlpMat = compute @U . overlap $ clVec
          anyOvlp = anyOtherOverlap clVec

          -- Join all sets that have overlap but keep them redundantly (no reduction of the amount
          -- of clusters).
          joined =
            ifoldlInner
              (\(_ :. n) acc ovlp -> if ovlp then (clVec ! n) <> acc else acc)
              mempty
              ovlpMat

          -- Find all sets at different indices that are the same. This is an upper triangular
          -- matrix with the main diagonal being False.
          sameMat =
            compute @U
              . imap (\(m :. n) v -> if m >= n then False else v)
              . same
              . compute @B
              $ joined

          -- Remove all sets that are redundant. Redundancy is checked by two criteria:
          --   1. Is this cluster the same set of points as any other cluster? If yes, it is
          --      redundant.
          --   2. Is this cluster isolated it is not redundant.
          nonRed =
            compute @B
              . sifilter
                ( \ix _ ->
                    let sameAsAnyOther = Massiv.any (== True) $ sameMat !> ix
                     in not sameAsAnyOther
                )
              $ joined
       in if anyOvlp then joinOverlapping nonRed else clVec

----------------------------------------------------------------------------------------------------
-- Hierarchical Cluster Analysis

-- | Nodes of a dendrogram.
data DendroNode e = DendroNode
  { distance :: e,
    cluster :: IntSet
  }
  deriving (Eq, Show, Generic)

instance (FromJSON e) => FromJSON (DendroNode e)

instance (ToJSON e) => ToJSON (DendroNode e)

-- | A dendrogram as a binary tree.
newtype Dendrogram e = Dendrogram {unDendro :: BinTree (DendroNode e)}
  deriving (Show, Eq, Generic)

instance ToJSON e => ToJSON (Dendrogram e)

instance FromJSON e => FromJSON (Dendrogram e)

-- | An accumulator to finally build a dendrogram by a bottom-up algorithm. Not to be exposed in the
-- API.
type DendroAcc e = Vector B (Dendrogram e)

-- | Mutable version of the dendrogram accumulator.
type DendroAccM m e = MArray (PrimState m) B Ix1 (Dendrogram e)

-- | Cut a 'Dendrogram' at a given distance and obtain all clusters from it.
cutDendroAt :: Ord e => Dendrogram e -> e -> Clusters
cutDendroAt dendro dist =
  let nodes = takeLeafyBranchesWhile (\DendroNode {distance} -> distance >= dist) . unDendro $ dendro
      clusters = compute @B . Massiv.map cluster . compute @B $ nodes
   in clusters

-- | A strategy/distance measure for clusters.
data JoinStrat e
  = SingleLinkage
  | CompleteLinkage
  | Median
  | UPGMA
  | WPGMA
  | Centroid
  | Ward
  | LWFB e
  | LW e e e e
  deriving (Eq, Show)

-- | Lance Williams formula to update distances.
{-# SCC lanceWilliams #-}
lanceWilliams ::
  Fractional e =>
  -- | How to calculate distance between clusters of points.
  JoinStrat e ->
  -- | Number of points in cluster \(A\).
  Int ->
  -- | Number of points in cluster \(B\)
  Int ->
  -- | Number of points in cluster \(C\)
  Int ->
  -- | \(d(A, B)\)
  e ->
  -- | \(d(A, C)\)
  e ->
  -- | \(d(B, C)\)
  e ->
  -- | Updated distance \(D \(A \cup B, C\)
  e
lanceWilliams js nA nB nC dAB dAC dBC = alpha1 * dAC + alpha2 * dBC + beta * dAB + gamma * abs (dAC - dBC)
  where
    nA' = fromIntegral nA
    nB' = fromIntegral nB
    nC' = fromIntegral nC
    (alpha1, alpha2, beta, gamma) = case js of
      SingleLinkage -> (1 / 2, 1 / 2, 0, - 1 / 2)
      CompleteLinkage -> (1 / 2, 1 / 2, 0, 1 / 2)
      Median -> (1 / 2, 1 / 2, - 1 / 4, 0)
      UPGMA -> (nA' / (nA' + nB'), nB' / (nA' + nB'), 0, 0)
      WPGMA -> (1 / 2, 1 / 2, 0, 0)
      Centroid -> (nA' / (nA' + nB'), nB' / (nA' + nB'), - (nA' * nB') / ((nA' + nB') ^ (2 :: Int)), 0)
      Ward -> ((nA' + nC') / (nA' + nB' + nC'), (nA' + nC') / (nA' + nB' + nC'), - (nA' + nC') / (nA' + nB' + nC'), 0)
      LWFB b -> ((1 - b) / 2, (1 - b) / 2, b, 0)
      LW a b c d -> (a, b, c, d)

----------------------------------------------------------------------------------------------------
-- Müllner Generic Hierarchical Clustering

-- | A neighbourlist. At index @i@ of the vector it contains a tuple with the minimal distance of
-- this cluster to any other cluster and the index of the other cluster.
type Neighbourlist r e = Vector r (e, Ix1)

-- | A distance matrix.
type DistanceMatrix r e = Matrix r e

-- | Performance improved hierarchical clustering algorithm. @GENERIC_LINKAGE@ from figure 3,
-- <https://arxiv.org/pdf/1109.2378.pdf>.
{-# SCC hca #-}
hca ::
  ( MonadThrow m,
    Mutable r Ix1 e,
    Mutable r Ix2 e,
    Mutable r Ix1 (e, Ix1),
    Manifest (R r) Ix1 e,
    OuterSlice r Ix2 e,
    Ord e,
    Unbox e,
    Fractional e
  ) =>
  DistFn r e ->
  JoinStrat e ->
  Matrix r e ->
  m (Dendrogram e)
hca distFn joinStrat points
  | Massiv.isEmpty points = throw $ SizeEmptyException (Sz nPoints)
  | otherwise = do
    let -- The distance matrix from the points.
        distMat = distFn points

    -- Initial vector of nearest neighbour to each point.
    nNghbr <- nearestNeighbours distMat

    let -- Initial priority queue of points. Has the minimum distance of all points.
        pq = PQ.fromList . Massiv.toList . Massiv.imap (\k (d, _) -> k PQ.:-> d) $ nNghbr
        -- Set of points not joined yet. Initially all points.
        s = IntSet.fromDistinctAscList [0 .. nPoints - 1]
        -- Initial dendrogram accumulator. The vector of all points as their own cluster.
        dendroAcc =
          makeArray @B @Ix1
            Par
            (Sz nPoints)
            (\p -> Dendrogram . Leaf $ DendroNode {distance = 0, cluster = IntSet.singleton p})

    distMatM <- return . unsafePerformIO . thaw $ distMat
    nNghbrM <- return . unsafePerformIO . thaw $ nNghbr
    dendroAccM <- return . unsafePerformIO . thaw $ dendroAcc

    return . unsafePerformIO $ agglomerate joinStrat distMatM nNghbrM pq s dendroAccM
  where
    Sz (_mFeatures :. nPoints) = size points

-- | Agglomerative clustering by the improved generic linkage algorithm. This is the main loop
-- recursion L 10-43.
{-# SCC agglomerate #-}
agglomerate ::
  ( MonadThrow m,
    PrimMonad m,
    MonadUnliftIO m,
    PrimState m ~ RealWorld,
    Mutable r Ix2 e,
    OuterSlice r Ix2 e,
    Manifest (R r) Ix1 e,
    Mutable r Ix1 (e, Ix1),
    Fractional e,
    Ord e
  ) =>
  -- | Join strategy for clusters and therefore how to calculate cluster-cluster distances.
  JoinStrat e ->
  -- | Distance matrix.
  MArray (PrimState m) r Ix2 e ->
  -- | List of nearest neighbours for each point.
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  -- | Priority queue with the distances as priorities and the cluster index as keys.
  PQ.PSQ Ix1 e ->
  -- | A set \(S\), that keeps track which clusters have already been joined.
  IntSet ->
  -- | Accumulator of the dendrogram. Should collapse to a singleton vector.
  DendroAccM m e ->
  -- | The final dendrogram, after all clusters have been joined.
  m (Dendrogram e)
agglomerate joinStrat distMat nNghbr pq s dendroAcc
  | IntSet.null s = throw $ IndexException "No clusters left. This must never happen."
  | otherwise = do
    -- Obtain candidates for the two clusters to join and the minimal distance in the priority queue.
    candidates <- getJoinCandidates nNghbr pq

    -- If the distance between a b is not the minimal distance that the priority queue has found, the
    -- neighbour list must be wrong and recalculated.
    (a, b, delta, nNghbrU1, pqU1) <- recalculateNghbr candidates s distMat nNghbr pq

    -- Remove the minimal element from the priority queue and join clusters a and b. The cluster
    -- accumulator is reduced in its size: a is removed and b is updated with the joined cluster.
    (newS, pqU2, newAcc) <- joinClusters a b delta s pqU1 dendroAcc

    -- Update the distance matrix in the row and column of b but not at (b,b) and not at (a,b) and
    -- (b,a).
    newDistMat <- updateDistMat joinStrat a b newS distMat newAcc

    -- Redirect neighbours to b, if they previously pointed to a.
    nNghbrU2 <- redirectNeighbours a b newS newDistMat nNghbrU1

    -- Update the neighbourlist and priority queue with the new distances to b.
    (newNNghbr, newPQ) <- updateBNeighbour b s newDistMat nNghbrU2 pqU2

    -- If the problem has been reduced to a single cluster the algorithm is done and the final
    -- dendrogram can be obtained from the accumulator at index b. Otherwise join further.
    if IntSet.size newS == 1
      then newAcc `readM` b
      else agglomerate joinStrat newDistMat newNNghbr newPQ newS newAcc

-- | Obtain candidates for the clusters to join by looking at the minimal distance in the priority
-- queue and the neighbourlist. L 11-13
{-# SCC getJoinCandidates #-}
getJoinCandidates ::
  ( MonadThrow m,
    PrimMonad m,
    Mutable r Ix1 (e, Ix1),
    Ord e
  ) =>
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  PQ.PSQ Ix1 e ->
  m (Ix1, Ix1, e)
getJoinCandidates nNghbr pq = do
  (a PQ.:-> d) <- case PQ.findMin pq of
    Nothing -> throwM $ IndexException "Empty priority queue"
    Just v -> return v
  (_, b) <- nNghbr `readM` a
  return (a, b, d)

-- | If the minimal distance @d@ found is not the distance between @a@ and @b@ recalculate the
-- neighbour list, update the priority queue and obtain a new set of a,b and a distance between them.
-- L 14-20.
{-# SCC recalculateNghbr #-}
recalculateNghbr ::
  ( MonadThrow m,
    PrimMonad m,
    MonadUnliftIO m,
    PrimState m ~ RealWorld,
    OuterSlice r Ix2 e,
    Manifest (R r) Ix1 e,
    Mutable r Ix1 (e, Ix1),
    Mutable r Ix2 e,
    Ord e
  ) =>
  (Ix1, Ix1, e) ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  PQ.PSQ Ix1 e ->
  m (Ix1, Ix1, e, MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e)
recalculateNghbr (cA, cB, d) s distMat nNghbr pq = do
  dAB <- distMat `readM` (cA :. cB)
  if d == dAB
    then return (cA, cB, d, nNghbr, pq)
    else do
      -- Recalculate the nearest neighbours just on index cA. Consider only clusters, that were not
      -- merged yet.
      dmRowA <- searchRow cA s distMat >>= unsafeFreeze Par
      newNeighbourA@(minDistA, _) <- minimumM dmRowA
      writeM nNghbr cA newNeighbourA

      -- Update the priority queue at key cA with the new distance.
      let newPQ = PQ.adjust (const minDistA) cA pq

      -- Determine new a, b and d from the updated neighbour list and priority queue.
      (a PQ.:-> newD) <- case PQ.findMin newPQ of
        Nothing -> throwM $ IndexException "Empty priority queue"
        Just v -> return v
      (_, b) <- nNghbr `readM` a
      recalculateNghbr (a, b, newD) s distMat nNghbr newPQ

-- | Joins the selected clusters \(A\) and \(B\) and updates the dendrogram accumulator at index b.
-- A will not be removed so that the accumulator never shrinks.
-- L 21-24
{-# SCC joinClusters #-}
joinClusters ::
  ( MonadThrow m,
    PrimMonad m,
    Ord e
  ) =>
  Ix1 ->
  Ix1 ->
  e ->
  IntSet ->
  PQ.PSQ Ix1 e ->
  DendroAccM m e ->
  m (IntSet, PQ.PSQ Ix1 e, DendroAccM m e)
joinClusters a b d s pq acc = do
  clA <- acc `readM` a
  let newPQ = PQ.deleteMin pq
  modifyM_
    acc
    ( \clB ->
        return
          . Dendrogram
          $ Node
            ( DendroNode
                { distance = d,
                  cluster = (cluster . root . unDendro $ clA) <> (cluster . root . unDendro $ clB)
                }
            )
            (unDendro clA)
            (unDendro clB)
    )
    b
  let newS = IntSet.delete a s
  return (newS, newPQ, acc)

-- | Update the distance matrix with a Lance-Williams update in the rows and columns of cluster b.
-- L 25-27
{-# SCC updateDistMat #-}
updateDistMat ::
  ( MonadThrow m,
    PrimMonad m,
    MonadUnliftIO m,
    Mutable r Ix2 e,
    Fractional e
  ) =>
  JoinStrat e ->
  Ix1 ->
  Ix1 ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  DendroAccM m e ->
  m (MArray (PrimState m) r Ix2 e)
updateDistMat js a b s distMat dendroAcc
  | nDM /= nDM = throwM $ SizeMismatchException (Sz nDM) (Sz nCl)
  | mDM /= nDM = throwM $ SizeMismatchException (Sz mDM) (Sz nDM)
  | otherwise = do
    dAB <- distMat `readM` (a :. b)
    nA <- clSize a
    nB <- clSize b
    forIO_ ixV $ \ix -> do
      dAX <- distMat `readM` (a :. ix)
      nX <- clSize ix
      modifyM_ distMat (\dBX -> return $ lanceWilliams js nA nB nX dAB dAX dBX) (ix :. b)
      modifyM_ distMat (\dBX -> return $ lanceWilliams js nA nB nX dAB dAX dBX) (b :. ix)
    return distMat
  where
    Sz (mDM :. nDM) = msize distMat
    Sz nCl = msize dendroAcc
    ixV = Massiv.fromList @U Par . IntSet.toAscList . IntSet.delete b $ s
    clSize i = IntSet.size . cluster . root . unDendro <$> dendroAcc `readM` i

-- | Updates the neighbourlist. All elements with a smaller index than a, that had a as a nearest
-- neighbour are blindly redirected to the union of a and b, now at index b.
-- L 28-32
{-# SCC redirectNeighbours #-}
redirectNeighbours ::
  ( MonadThrow m,
    PrimMonad m,
    MonadUnliftIO m,
    Mutable r Ix1 (e, Ix1),
    Mutable r Ix2 e
  ) =>
  Ix1 ->
  Ix1 ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  m (MArray (PrimState m) r Ix1 (e, Ix1))
redirectNeighbours a b s distMat nNghbr = do
  forIO_ ixV $ \ix ->
    modifyM_
      nNghbr
      ( \old@(_, nghbrX) ->
          if nghbrX == a
            then distMat `readM` (ix :. b) >>= \dXB -> return (dXB, b)
            else return old
      )
      ix
  return nNghbr
  where
    ixV = compute @U . sfilter (< a) . Massiv.fromList @U Par . IntSet.toAscList $ s

-- | Updates the list of nearest neighbours for all combinations that might have changed by
-- recalculation with the joined cluster AB at index b.
-- L
{-# SCC updateWithNewBDists #-}
updateWithNewBDists ::
  ( MonadThrow m,
    MonadUnliftIO m,
    PrimMonad m,
    Mutable r Ix2 e,
    Mutable r Ix1 (e, Ix1),
    Ord e
  ) =>
  Ix1 ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  PQ.PSQ Ix1 e ->
  m (MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e)
updateWithNewBDists b s distMat nNghbr pq = do
  pqT <- newTVarIO pq
  forIO_ ixV $ \ix -> do
    dBX <- distMat `readM` (ix :. b)
    currentPQ <- readTVarIO pqT
    minDistX <- case PQ.lookup ix currentPQ of
      Nothing -> throwM $ IndexException "Empty priority queue."
      Just v -> return v
    if dBX < minDistX
      then do
        writeM nNghbr ix (dBX, b)
        atomically . writeTVar pqT . PQ.adjust (const dBX) ix $ currentPQ
      else atomically . writeTVar pqT $ currentPQ

  newPQ <- readTVarIO pqT
  return (nNghbr, newPQ)
  where
    ixV = compute @U . Massiv.sfilter (< b) . Massiv.fromList @U Par . IntSet.toAscList $ s

--  | Updates the list of nearest neighbours and the priority queue at key b.
-- L 39-40
{-# SCC updateBNeighbour #-}
updateBNeighbour ::
  ( MonadThrow m,
    PrimMonad m,
    MonadUnliftIO m,
    Mutable r Ix1 (e, Ix1),
    Mutable r Ix2 e,
    Ord e
  ) =>
  Ix1 ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  MArray (PrimState m) r Ix1 (e, Ix1) ->
  PQ.PSQ Ix1 e ->
  m (MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e)
updateBNeighbour b s distMat nNghbr pq =
  if b >= nNeighbours
    then return (nNghbr, pq)
    else do
      rowAB <- searchRow b s distMat >>= unsafeFreeze Par
      newNeighbourB@(distB, neighbourB) <- minimumM rowAB
      writeM nNghbr b newNeighbourB
      let newPQ = PQ.adjust (const distB) neighbourB pq
      return (nNghbr, newPQ)
  where
    Sz nNeighbours = msize nNghbr

-- | Find the nearest neighbour for each point from a distance matrix. For each point it stores the
-- minimum distance and the index of the other point, that is the nearest neighbour but at a higher
-- index.
{-# SCC nearestNeighbours #-}
nearestNeighbours ::
  ( MonadThrow m,
    Mutable r Ix1 e,
    Mutable r Ix1 (e, Ix1),
    OuterSlice r Ix2 e,
    Source (R r) Ix1 e,
    Ord e,
    Unbox e
  ) =>
  Matrix r e ->
  m (Vector r (e, Ix1))
nearestNeighbours distMat
  | m /= n = throwM $ IndexException "Distance matrix is not square"
  | m == 0 = throwM $ IndexException "Distance matrix is empty"
  | otherwise =
    let rows = compute @B . outerSlices $ distMat
        minDistIx =
          Massiv.imap (\i v -> unsafePerformIO . minDistAtVec i . compute @U $ v) . init $ rows
     in return . compute $ minDistIx
  where
    Sz (m :. n) = size distMat

-- Make a search row for distances. Takes row x from a distance matrix and zips them with their
-- column index. Then keeps only the valid elements of the row, that are still part of the available
-- points. A minimum or maximum search can be performed on the resulting vector and a valid pair of
-- distance and index can be obtained.
searchRow ::
  ( PrimMonad m,
    MonadThrow m,
    MonadUnliftIO m,
    Mutable r Ix2 e,
    Mutable r Ix1 (e, Ix1)
  ) =>
  Ix1 ->
  IntSet ->
  MArray (PrimState m) r Ix2 e ->
  m (MArray (PrimState m) r Ix1 (e, Ix1))
searchRow x s dm =
  makeMArray Par (size ixV) $ \ix -> do
    dmIx <- ixV !? ix
    (dm `readM` (x :. dmIx)) >>= \dist -> return (dist, dmIx)
  where
    ixV :: Vector U Ix1
    ixV = compute @U . sfilter (> x) . Massiv.fromList @U Par . IntSet.toAscList $ s