-- | -- Module : ConClusion.Numeric.Statistics -- Description : Statistical Functions -- Copyright : Phillip Seeber, 2021 -- License : AGPL-3 -- Maintainer : phillip.seeber@googlemail.com -- Stability : experimental -- Portability : POSIX, Windows module ConClusion.Numeric.Statistics ( -- * PCA PCA (..), pca, -- * Variance normalise, meanDeviation, covariance, -- * Distance Metrics DistFn, lpNorm, manhattan, euclidean, mahalanobis, -- * Cluster Algorithms Clusters, -- ** DBScan DistanceInvalidException (..), dbscan, -- ** Hierarchical Cluster Analysis Dendrogram, JoinStrat (..), hca, cutDendroAt, ) where import ConClusion.Numeric.Data hiding (normalise) import Data.Aeson hiding (Array) import Data.Complex import qualified Data.IntSet as IntSet import Data.Massiv.Array as Massiv import Data.Massiv.Array.Unsafe as Massiv import qualified Data.PSQueue as PQ import qualified Numeric.LinearAlgebra as LA import RIO hiding (Vector) import System.IO.Unsafe (unsafePerformIO) ---------------------------------------------------------------------------------------------------- -- Others/Helpers -- | Solves eigenvalue problem of a square matrix and obtains its eigenvalues and eigenvectors. {-# SCC eig #-} eig :: ( Mutable r1 Ix1 (Complex Double), Mutable r2 Ix1 (Complex Double), LA.Field e, Manifest r3 Ix1 e, Resize r3 Ix2, Load r3 Ix2 e, MonadThrow m ) => Matrix r3 e -> m (Vector r1 (Complex Double), Matrix r2 (Complex Double)) eig covM | m /= n = throwM $ IndexException "eigenvalue problems can only be solved for square matrix" | otherwise = return . bimap vecH2M matH2M . LA.eig $ cov where Sz (m :. n) = size covM cov = matM2H covM -- | Sort eigenvalues and eigenvectors by magnitude of the eigenvalues in descending order (largest -- eigenvalues first). Eigenvectors are the columns of the input matrix. {-# SCC eigSort #-} eigSort :: ( Load r2 Ix2 e, MonadThrow m, Source r1 Ix1 e, Source r2 Ix2 e, Mutable r1 Ix1 e, Mutable r2 Ix2 e, Unbox e, Ord e ) => (Vector r1 e, Matrix r2 e) -> m (Vector r1 e, Matrix r2 e) eigSort (vec, mat) | m /= n = throwM $ IndexException "matrix of the eigenvectors is not a square matrix" | n /= n' = throwM $ IndexException "different number of eigenvalues and eigenvectors" | otherwise = do let ixedEigenvalues = Massiv.zip vec ixVec (eigenValSortAsc, ixSort) = (\a -> (get fst a, get snd a)) . quicksort . compute @U $ ixedEigenvalues eigenVecSortAsc = backpermute' (Sz $ m :. n) (\(r :. c) -> r :. (ixSort ! c)) mat eigenValSort = reverse' (Dim 1) eigenValSortAsc eigenVecSort = reverse' (Dim 1) eigenVecSortAsc return (compute eigenValSort, compute eigenVecSort) where Sz (m :. n) = size mat Sz n' = size vec ixVec = makeArrayLinear @D Seq (Sz n') id get acc = compute @U . Massiv.map acc ---------------------------------------------------------------------------------------------------- -- Principal Component Analysis data PCA = PCA { -- | Original feature matrix. x :: Matrix U Double, -- | Feature matrix in mean deviation form. x' :: Matrix U Double, -- | Transformed data. y :: Matrix U Double, -- | Transformation matrix to transform feature matrix into PCA result matrix. a :: Matrix U Double, -- | Mean squared error introduced by PCA. mse :: Double, -- | Percentage of the behaviour captured in the remaining dimensions. remaining :: Double, -- | All eigenvalues from the diagonalisation of the covariance matrix. allEigenValues :: Vector U Double, -- | Eigenvalues that were kept for PCA. pcaEigenValues :: Vector U Double, -- | All eigenvectors from the diagonalisation of the covariance matrix. allEigenVecs :: Matrix U Double, -- | Eigenvectors that were kept for PCA. pcaEigenVecs :: Matrix U Double } -- | Transform the input values with a transformation matrix \(\mathbf{A}\), where \(\mathbf{A}\) is -- constructed from the eigenvectors associated to the largest eigenvalues. {-# SCC transformToPCABasis #-} transformToPCABasis :: ( Source (R r) Ix2 e, Extract r Ix2 e, Mutable r Ix2 e, Numeric r e, MonadThrow m ) => -- | Number of dimensions to keep from PCA. Int -> -- | Matrix of the eigenvectors, sorted descendingly by eigenvalues, where the eigenvectors are -- the columns of the matrix. Matrix r e -> -- | Feature matrix in mean deviation form. Matrix r e -> -- | Input data transformed by PCA to lower dimensions, and the transformation matrix -- \(\mathbf{A}\). m (Matrix r e, Matrix r e) transformToPCABasis nDim eigenVecMat featureMat | mE /= nE = throwM $ IndexException "the matrix of the eigenvectors must be a quadratic matrix" | nDim <= 0 = throwM $ IndexException "the number of dimensions of the PCA is smaller than or zero" | nDim >= nE = throwM $ IndexException "more than the possible amount of dimensions has been selected" | mE /= mF = throwM $ IndexException "eigenvector matrix and feature matrix have mismatching dimensions" | otherwise = do matA <- compute . transpose <$> extractM (0 :. 0) (Sz $ mE :. nDim) eigenVecMat pcaData <- matA .><. featureMat return (pcaData, matA) where Sz (mE :. nE) = size eigenVecMat Sz (mF :. _nF) = size featureMat -- | Performs a PCA on the feature matrix \(\mathbf{X}\) by solving the eigenproblem of the -- covariance matrix. The function takes the feature matrix directly and perfoms the conversion -- to mean deviation form, the calculation of the covariance matrix and the eigenvalue problem -- automatically. {-# SCC pca #-} pca :: ( Numeric r Double, Mutable r Ix2 Double, Manifest r Ix1 Double, Source (R r) Ix2 Double, Extract r Ix2 Double, MonadThrow m ) => -- | Dimensionalty after PCA transformation. Int -> -- | \(m \times n\) Feaute matrix \(\mathbf{X}\), with \(m\) different measurements (rows) in -- \(n\) different trials (columns). Matrix r Double -> m PCA pca dim x = do -- Calculate the mean deviation form of the feature matrix and the covariance matrix from it. let x' = normalise . meanDeviation $ x cov = covariance x' -- Obtain eigenvalues and eigenvectors of the covariance matrix and sort them. (eigValsC :: Vector U (Complex Double), eigVecsC :: Matrix U (Complex Double)) <- eig cov let eigValsR = compute @U . Massiv.map realPart $ eigValsC eigVecsR = compute . Massiv.map realPart $ eigVecsC (eValS, eVecS) <- eigSort (eigValsR, eigVecsR) -- Use the subset of the eigenvectors with the largest eigenvalues to transform the features in -- mean deviation form into the result matrix Y. (pcaData, matA) <- transformToPCABasis dim eVecS x' -- Reconstuct the original data from lower dimensions and calculate the mean squared deviation. reconstructX <- (compute . transpose $ matA) .><. pcaData mse <- (/ fromIntegral n) . Massiv.sum . Massiv.map (** 2) <$> (x' .-. reconstructX) -- For output give the eigenvalues and eigenvectors that were kept. pcaEigenValues <- extractM 0 (Sz dim) eValS pcaEigenVecs <- extractM (0 :. 0) (Sz $ m :. dim) eVecS -- Calculate the amount of behaviour that could be kept. let remaining = (Massiv.sum pcaEigenValues / Massiv.sum eValS) * 100 return PCA { x = compute x, x' = compute x', y = compute pcaData, a = compute matA, mse = mse, remaining = remaining, allEigenValues = eValS, pcaEigenValues = compute pcaEigenValues, allEigenVecs = compute eVecS, pcaEigenVecs = compute pcaEigenVecs } where Sz (m :. n) = size x ---------------------------------------------------------------------------------------------------- -- Variance -- | Subtract the mean value of all columns from the feature matrix. Brings the feature matrix to -- mean deviation form. {-# SCC meanDeviation #-} meanDeviation :: ( Source r Ix2 e, Fractional e, Unbox e, Numeric r e, Mutable r Ix2 e ) => Matrix r e -> Matrix r e meanDeviation mat = mat !-! compute meanMat where Sz (_ :. n) = Massiv.size mat featueMean = Massiv.foldlInner (+) 0 mat .* (1 / fromIntegral n) meanMat = expandInner (Sz n) const . compute @U $ featueMean -- | Obtains the covariance matrix \(\mathbf{C_X}\) from the feature matrix \(\mathbf{X}\). -- \[ -- \mathbf{C_X} \equiv \frac{1}{n - 1} \mathbf{X} \mathbf{X}^T -- \] -- where \(n\) is the number of columns in the matrix. -- -- The feature matrix should be in mean deviation form, see 'meanDeviation'. {-# SCC covariance #-} covariance :: (Numeric r e, Mutable r Ix2 e, Fractional e) => Matrix r e -> Matrix r e covariance x = (1 / (fromIntegral n - 1)) *. (x !> Array r Ix2 e -> Array r Ix2 e normalise mat = let absMat = Massiv.map abs mat maxPerRow = compute @U . foldlInner max 0 $ absMat divMat = compute . Massiv.map (1 /) . expandInner @Ix2 (Sz n) const $ maxPerRow in divMat !*! mat where Sz (_ :. n) = size mat ---------------------------------------------------------------------------------------------------- -- Distance Measures -- | Distance matrix generator functions. type DistFn r e = Matrix r e -> Matrix r e -- | Builds the distance measures in a permutation matrix/distance matrix. buildDistMat :: (Mutable r Ix2 e) => -- | Zip function to combine the elements of vectors \(\mathbf{a}\) \(\mathbf{b}\). Usually @(-)@. -- \( f(\mathbf{a}_i, \mathbf{b}_i) = \mathbf{c} \) (e -> e -> a) -> -- | Fold the vector \(\mathbf{c}\) elementwise to a distance \(d\). (a -> a -> a) -> -- | Accumulator of the fold function. a -> -- | \(m \times n\) matrix, with \(n\) \(m\)-dimensional points (column vectors of the matrix). Matrix r e -> -- | Resulting distance matrix. Matrix D a buildDistMat zipFn foldFn acc mat = let a = transposeOuter . expandOuter @Ix3 (Sz n) const $ mat b = transposeInner a ab = Massiv.zipWith zipFn a b d = foldlInner foldFn acc ab in d where Sz (_ :. n) = size mat -- | The \(L_p\) norm between two vectors. Generalisation of Manhattan and Euclidean distances. -- \[ -- d(\mathbf{a}, \mathbf{b}) = \left( \sum \limits_{i=1}^n \lvert \mathbf{a}_i - \mathbf{b}_i \rvert ^p \right) ^ \frac{1}{p} -- \] {-# SCC lpNorm #-} lpNorm :: (Mutable r Ix2 e, Floating e) => Int -> DistFn r e lpNorm p = compute . buildDistMat zipFn foldFn 0 where zipFn a b = (^ p) . abs $ a - b foldFn a b = (** (1 / fromIntegral p)) $ a + b -- | The Manhattan distance between two vectors. Specialisation of the \(L_p\) norm for \(p = 1\). -- \[ -- d(\mathbf{a}, \mathbf{b}) = \sum \limits_{i=1}^n \lvert \mathbf{a}_i - \mathbf{b}_i \rvert -- \] {-# SCC manhattan #-} manhattan :: (Mutable r Ix2 e, Floating e) => DistFn r e manhattan = lpNorm 1 -- | The Euclidean distance between two vectors. Specialisation of the \(L_p\) norm for \(p = 2\). -- \[ -- d(\mathbf{a}, \mathbf{b}) = \sqrt{\sum \limits_{i=1}^n (\mathbf{a}_i - \mathbf{b}_i)^2} -- \] {-# SCC euclidean #-} euclidean :: (Mutable r Ix2 e, Floating e) => DistFn r e euclidean = lpNorm 2 -- | Mahalanobis distance between points. Suitable for non correlated axes. -- \[ -- d(\mathbf{a}, \mathbf{b}) = \sqrt{(\mathbf{a} - \mathbf{b})^T \mathbf{S}^{-1} (\mathbf{a} - \mathbf{b})} -- \] -- where \(\mathbf{S}\) is the covariance matrix. {-# SCC mahalanobis #-} mahalanobis :: (Unbox e, Numeric r e, Mutable r Ix2 e, Mutable r Ix1 e, LA.Field e) => DistFn r e mahalanobis points = let a = transposeOuter . expandOuter @Ix3 (Sz n) const $ points b = transposeInner a abDiff = compute @U $ a !-! b ixArray = makeArray @U @Ix2 @Ix2 Par (Sz $ n :. n) id distMat = Massiv.map ( \(x :. y) -> let ab = compute @U $ abDiff !> x !> y in ab > Exception (DistanceInvalidException e) -- | Representation of clusters. type Clusters = Vector B IntSet -- | DBScan algorithm. {-# SCC dbscan #-} dbscan :: ( MonadThrow m, Ord e, Num e, Typeable e, Show e, Source r Ix2 e ) => -- | Distance measure to build the distance matrix of all points. DistFn r e -> -- | Minimal number of members in a cluster. Int -> -- | Search radius \(\epsilon\) e -> -- | \(n\) \(m\)-dimensional data points as column vectors of a \(m \times n\) matrix. Matrix r e -> -- | Resulting clusters. m Clusters dbscan distFn nPoints epsilon points | isEmpty points = throwM $ SizeEmptyException (Sz 0 :: Sz1) | nPoints < 1 = throwM $ SizeNegativeException (Sz nPoints) | epsilon <= 0 = throwM $ DistanceInvalidException epsilon | otherwise = let pointNeighbours = ifoldlInner collectNeighbours mempty distMat allClusters = joinOverlapping . compute @B $ pointNeighbours largeClusters = sfilter (\s -> IntSet.size s >= nPoints) allClusters in return $ compute largeClusters where -- The distance matrix in the measure chosen by the distance function. distMat = distFn points -- Function to collect the neighbours of a point within the search radius epsilon. {-# SCC collectNeighbours #-} collectNeighbours (_ :. n) acc d = if d <= epsilon then IntSet.insert n acc else acc -- Construct the overlap matrix of all clusters. compareSets :: (IntSet -> IntSet -> Bool) -> Vector B IntSet -> Matrix D Bool compareSets fn clVec = let a = expandOuter @Ix2 sz const clVec b = transpose a compareMat = Massiv.zipWith fn a b in compareMat where sz = size clVec -- Overlap matrix. Checks if two sets have any overlap. Sets do overlap with themself. overlap :: Vector B IntSet -> Matrix D Bool overlap = compareSets (\a b -> not $ IntSet.disjoint a b) -- Check if any set overlaps wiht **any** other set. anyOtherOverlap :: Vector B IntSet -> Bool anyOtherOverlap = Massiv.any (== True) . imap (\(m :. n) v -> if m == n then False else v) . overlap -- Check if two sets are identical. Sets are identical to themself. same :: Vector B IntSet -> Matrix D Bool same = compareSets (==) -- Join all overlapping clusters recursively. {-# SCC joinOverlapping #-} joinOverlapping :: Vector B IntSet -> Vector B IntSet joinOverlapping clVec = let -- The overlap matrix of the clusters. ovlpMat = compute @U . overlap $ clVec anyOvlp = anyOtherOverlap clVec -- Join all sets that have overlap but keep them redundantly (no reduction of the amount -- of clusters). joined = ifoldlInner (\(_ :. n) acc ovlp -> if ovlp then (clVec ! n) <> acc else acc) mempty ovlpMat -- Find all sets at different indices that are the same. This is an upper triangular -- matrix with the main diagonal being False. sameMat = compute @U . imap (\(m :. n) v -> if m >= n then False else v) . same . compute @B $ joined -- Remove all sets that are redundant. Redundancy is checked by two criteria: -- 1. Is this cluster the same set of points as any other cluster? If yes, it is -- redundant. -- 2. Is this cluster isolated it is not redundant. nonRed = compute @B . sifilter ( \ix _ -> let sameAsAnyOther = Massiv.any (== True) $ sameMat !> ix in not sameAsAnyOther ) $ joined in if anyOvlp then joinOverlapping nonRed else clVec ---------------------------------------------------------------------------------------------------- -- Hierarchical Cluster Analysis -- | Nodes of a dendrogram. data DendroNode e = DendroNode { distance :: e, cluster :: IntSet } deriving (Eq, Show, Generic) instance (FromJSON e) => FromJSON (DendroNode e) instance (ToJSON e) => ToJSON (DendroNode e) -- | A dendrogram as a binary tree. newtype Dendrogram e = Dendrogram {unDendro :: BinTree (DendroNode e)} deriving (Show, Eq, Generic) instance ToJSON e => ToJSON (Dendrogram e) instance FromJSON e => FromJSON (Dendrogram e) -- | An accumulator to finally build a dendrogram by a bottom-up algorithm. Not to be exposed in the -- API. type DendroAcc e = Vector B (Dendrogram e) -- | Mutable version of the dendrogram accumulator. type DendroAccM m e = MArray (PrimState m) B Ix1 (Dendrogram e) -- | Cut a 'Dendrogram' at a given distance and obtain all clusters from it. cutDendroAt :: Ord e => Dendrogram e -> e -> Clusters cutDendroAt dendro dist = let nodes = takeLeafyBranchesWhile (\DendroNode {distance} -> distance >= dist) . unDendro $ dendro clusters = compute @B . Massiv.map cluster . compute @B $ nodes in clusters -- | A strategy/distance measure for clusters. data JoinStrat e = SingleLinkage | CompleteLinkage | Median | UPGMA | WPGMA | Centroid | Ward | LWFB e | LW e e e e deriving (Eq, Show) -- | Lance Williams formula to update distances. {-# SCC lanceWilliams #-} lanceWilliams :: Fractional e => -- | How to calculate distance between clusters of points. JoinStrat e -> -- | Number of points in cluster \(A\). Int -> -- | Number of points in cluster \(B\) Int -> -- | Number of points in cluster \(C\) Int -> -- | \(d(A, B)\) e -> -- | \(d(A, C)\) e -> -- | \(d(B, C)\) e -> -- | Updated distance \(D \(A \cup B, C\) e lanceWilliams js nA nB nC dAB dAC dBC = alpha1 * dAC + alpha2 * dBC + beta * dAB + gamma * abs (dAC - dBC) where nA' = fromIntegral nA nB' = fromIntegral nB nC' = fromIntegral nC (alpha1, alpha2, beta, gamma) = case js of SingleLinkage -> (1 / 2, 1 / 2, 0, - 1 / 2) CompleteLinkage -> (1 / 2, 1 / 2, 0, 1 / 2) Median -> (1 / 2, 1 / 2, - 1 / 4, 0) UPGMA -> (nA' / (nA' + nB'), nB' / (nA' + nB'), 0, 0) WPGMA -> (1 / 2, 1 / 2, 0, 0) Centroid -> (nA' / (nA' + nB'), nB' / (nA' + nB'), - (nA' * nB') / ((nA' + nB') ^ (2 :: Int)), 0) Ward -> ((nA' + nC') / (nA' + nB' + nC'), (nA' + nC') / (nA' + nB' + nC'), - (nA' + nC') / (nA' + nB' + nC'), 0) LWFB b -> ((1 - b) / 2, (1 - b) / 2, b, 0) LW a b c d -> (a, b, c, d) ---------------------------------------------------------------------------------------------------- -- Müllner Generic Hierarchical Clustering -- | A neighbourlist. At index @i@ of the vector it contains a tuple with the minimal distance of -- this cluster to any other cluster and the index of the other cluster. type Neighbourlist r e = Vector r (e, Ix1) -- | A distance matrix. type DistanceMatrix r e = Matrix r e -- | Performance improved hierarchical clustering algorithm. @GENERIC_LINKAGE@ from figure 3, -- . {-# SCC hca #-} hca :: ( MonadThrow m, Mutable r Ix1 e, Mutable r Ix2 e, Mutable r Ix1 (e, Ix1), Manifest (R r) Ix1 e, OuterSlice r Ix2 e, Ord e, Unbox e, Fractional e ) => DistFn r e -> JoinStrat e -> Matrix r e -> m (Dendrogram e) hca distFn joinStrat points | Massiv.isEmpty points = throw $ SizeEmptyException (Sz nPoints) | otherwise = do let -- The distance matrix from the points. distMat = distFn points -- Initial vector of nearest neighbour to each point. nNghbr <- nearestNeighbours distMat let -- Initial priority queue of points. Has the minimum distance of all points. pq = PQ.fromList . Massiv.toList . Massiv.imap (\k (d, _) -> k PQ.:-> d) $ nNghbr -- Set of points not joined yet. Initially all points. s = IntSet.fromDistinctAscList [0 .. nPoints - 1] -- Initial dendrogram accumulator. The vector of all points as their own cluster. dendroAcc = makeArray @B @Ix1 Par (Sz nPoints) (\p -> Dendrogram . Leaf $ DendroNode {distance = 0, cluster = IntSet.singleton p}) distMatM <- return . unsafePerformIO . thaw $ distMat nNghbrM <- return . unsafePerformIO . thaw $ nNghbr dendroAccM <- return . unsafePerformIO . thaw $ dendroAcc return . unsafePerformIO $ agglomerate joinStrat distMatM nNghbrM pq s dendroAccM where Sz (_mFeatures :. nPoints) = size points -- | Agglomerative clustering by the improved generic linkage algorithm. This is the main loop -- recursion L 10-43. {-# SCC agglomerate #-} agglomerate :: ( MonadThrow m, PrimMonad m, MonadUnliftIO m, PrimState m ~ RealWorld, Mutable r Ix2 e, OuterSlice r Ix2 e, Manifest (R r) Ix1 e, Mutable r Ix1 (e, Ix1), Fractional e, Ord e ) => -- | Join strategy for clusters and therefore how to calculate cluster-cluster distances. JoinStrat e -> -- | Distance matrix. MArray (PrimState m) r Ix2 e -> -- | List of nearest neighbours for each point. MArray (PrimState m) r Ix1 (e, Ix1) -> -- | Priority queue with the distances as priorities and the cluster index as keys. PQ.PSQ Ix1 e -> -- | A set \(S\), that keeps track which clusters have already been joined. IntSet -> -- | Accumulator of the dendrogram. Should collapse to a singleton vector. DendroAccM m e -> -- | The final dendrogram, after all clusters have been joined. m (Dendrogram e) agglomerate joinStrat distMat nNghbr pq s dendroAcc | IntSet.null s = throw $ IndexException "No clusters left. This must never happen." | otherwise = do -- Obtain candidates for the two clusters to join and the minimal distance in the priority queue. candidates <- getJoinCandidates nNghbr pq -- If the distance between a b is not the minimal distance that the priority queue has found, the -- neighbour list must be wrong and recalculated. (a, b, delta, nNghbrU1, pqU1) <- recalculateNghbr candidates s distMat nNghbr pq -- Remove the minimal element from the priority queue and join clusters a and b. The cluster -- accumulator is reduced in its size: a is removed and b is updated with the joined cluster. (newS, pqU2, newAcc) <- joinClusters a b delta s pqU1 dendroAcc -- Update the distance matrix in the row and column of b but not at (b,b) and not at (a,b) and -- (b,a). newDistMat <- updateDistMat joinStrat a b newS distMat newAcc -- Redirect neighbours to b, if they previously pointed to a. nNghbrU2 <- redirectNeighbours a b newS newDistMat nNghbrU1 -- Update the neighbourlist and priority queue with the new distances to b. (newNNghbr, newPQ) <- updateBNeighbour b s newDistMat nNghbrU2 pqU2 -- If the problem has been reduced to a single cluster the algorithm is done and the final -- dendrogram can be obtained from the accumulator at index b. Otherwise join further. if IntSet.size newS == 1 then newAcc `readM` b else agglomerate joinStrat newDistMat newNNghbr newPQ newS newAcc -- | Obtain candidates for the clusters to join by looking at the minimal distance in the priority -- queue and the neighbourlist. L 11-13 {-# SCC getJoinCandidates #-} getJoinCandidates :: ( MonadThrow m, PrimMonad m, Mutable r Ix1 (e, Ix1), Ord e ) => MArray (PrimState m) r Ix1 (e, Ix1) -> PQ.PSQ Ix1 e -> m (Ix1, Ix1, e) getJoinCandidates nNghbr pq = do (a PQ.:-> d) <- case PQ.findMin pq of Nothing -> throwM $ IndexException "Empty priority queue" Just v -> return v (_, b) <- nNghbr `readM` a return (a, b, d) -- | If the minimal distance @d@ found is not the distance between @a@ and @b@ recalculate the -- neighbour list, update the priority queue and obtain a new set of a,b and a distance between them. -- L 14-20. {-# SCC recalculateNghbr #-} recalculateNghbr :: ( MonadThrow m, PrimMonad m, MonadUnliftIO m, PrimState m ~ RealWorld, OuterSlice r Ix2 e, Manifest (R r) Ix1 e, Mutable r Ix1 (e, Ix1), Mutable r Ix2 e, Ord e ) => (Ix1, Ix1, e) -> IntSet -> MArray (PrimState m) r Ix2 e -> MArray (PrimState m) r Ix1 (e, Ix1) -> PQ.PSQ Ix1 e -> m (Ix1, Ix1, e, MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e) recalculateNghbr (cA, cB, d) s distMat nNghbr pq = do dAB <- distMat `readM` (cA :. cB) if d == dAB then return (cA, cB, d, nNghbr, pq) else do -- Recalculate the nearest neighbours just on index cA. Consider only clusters, that were not -- merged yet. dmRowA <- searchRow cA s distMat >>= unsafeFreeze Par newNeighbourA@(minDistA, _) <- minimumM dmRowA writeM nNghbr cA newNeighbourA -- Update the priority queue at key cA with the new distance. let newPQ = PQ.adjust (const minDistA) cA pq -- Determine new a, b and d from the updated neighbour list and priority queue. (a PQ.:-> newD) <- case PQ.findMin newPQ of Nothing -> throwM $ IndexException "Empty priority queue" Just v -> return v (_, b) <- nNghbr `readM` a recalculateNghbr (a, b, newD) s distMat nNghbr newPQ -- | Joins the selected clusters \(A\) and \(B\) and updates the dendrogram accumulator at index b. -- A will not be removed so that the accumulator never shrinks. -- L 21-24 {-# SCC joinClusters #-} joinClusters :: ( MonadThrow m, PrimMonad m, Ord e ) => Ix1 -> Ix1 -> e -> IntSet -> PQ.PSQ Ix1 e -> DendroAccM m e -> m (IntSet, PQ.PSQ Ix1 e, DendroAccM m e) joinClusters a b d s pq acc = do clA <- acc `readM` a let newPQ = PQ.deleteMin pq modifyM_ acc ( \clB -> return . Dendrogram $ Node ( DendroNode { distance = d, cluster = (cluster . root . unDendro $ clA) <> (cluster . root . unDendro $ clB) } ) (unDendro clA) (unDendro clB) ) b let newS = IntSet.delete a s return (newS, newPQ, acc) -- | Update the distance matrix with a Lance-Williams update in the rows and columns of cluster b. -- L 25-27 {-# SCC updateDistMat #-} updateDistMat :: ( MonadThrow m, PrimMonad m, MonadUnliftIO m, Mutable r Ix2 e, Fractional e ) => JoinStrat e -> Ix1 -> Ix1 -> IntSet -> MArray (PrimState m) r Ix2 e -> DendroAccM m e -> m (MArray (PrimState m) r Ix2 e) updateDistMat js a b s distMat dendroAcc | nDM /= nDM = throwM $ SizeMismatchException (Sz nDM) (Sz nCl) | mDM /= nDM = throwM $ SizeMismatchException (Sz mDM) (Sz nDM) | otherwise = do dAB <- distMat `readM` (a :. b) nA <- clSize a nB <- clSize b forIO_ ixV $ \ix -> do dAX <- distMat `readM` (a :. ix) nX <- clSize ix modifyM_ distMat (\dBX -> return $ lanceWilliams js nA nB nX dAB dAX dBX) (ix :. b) modifyM_ distMat (\dBX -> return $ lanceWilliams js nA nB nX dAB dAX dBX) (b :. ix) return distMat where Sz (mDM :. nDM) = msize distMat Sz nCl = msize dendroAcc ixV = Massiv.fromList @U Par . IntSet.toAscList . IntSet.delete b $ s clSize i = IntSet.size . cluster . root . unDendro <$> dendroAcc `readM` i -- | Updates the neighbourlist. All elements with a smaller index than a, that had a as a nearest -- neighbour are blindly redirected to the union of a and b, now at index b. -- L 28-32 {-# SCC redirectNeighbours #-} redirectNeighbours :: ( MonadThrow m, PrimMonad m, MonadUnliftIO m, Mutable r Ix1 (e, Ix1), Mutable r Ix2 e ) => Ix1 -> Ix1 -> IntSet -> MArray (PrimState m) r Ix2 e -> MArray (PrimState m) r Ix1 (e, Ix1) -> m (MArray (PrimState m) r Ix1 (e, Ix1)) redirectNeighbours a b s distMat nNghbr = do forIO_ ixV $ \ix -> modifyM_ nNghbr ( \old@(_, nghbrX) -> if nghbrX == a then distMat `readM` (ix :. b) >>= \dXB -> return (dXB, b) else return old ) ix return nNghbr where ixV = compute @U . sfilter (< a) . Massiv.fromList @U Par . IntSet.toAscList $ s -- | Updates the list of nearest neighbours for all combinations that might have changed by -- recalculation with the joined cluster AB at index b. -- L {-# SCC updateWithNewBDists #-} updateWithNewBDists :: ( MonadThrow m, MonadUnliftIO m, PrimMonad m, Mutable r Ix2 e, Mutable r Ix1 (e, Ix1), Ord e ) => Ix1 -> IntSet -> MArray (PrimState m) r Ix2 e -> MArray (PrimState m) r Ix1 (e, Ix1) -> PQ.PSQ Ix1 e -> m (MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e) updateWithNewBDists b s distMat nNghbr pq = do pqT <- newTVarIO pq forIO_ ixV $ \ix -> do dBX <- distMat `readM` (ix :. b) currentPQ <- readTVarIO pqT minDistX <- case PQ.lookup ix currentPQ of Nothing -> throwM $ IndexException "Empty priority queue." Just v -> return v if dBX < minDistX then do writeM nNghbr ix (dBX, b) atomically . writeTVar pqT . PQ.adjust (const dBX) ix $ currentPQ else atomically . writeTVar pqT $ currentPQ newPQ <- readTVarIO pqT return (nNghbr, newPQ) where ixV = compute @U . Massiv.sfilter (< b) . Massiv.fromList @U Par . IntSet.toAscList $ s -- | Updates the list of nearest neighbours and the priority queue at key b. -- L 39-40 {-# SCC updateBNeighbour #-} updateBNeighbour :: ( MonadThrow m, PrimMonad m, MonadUnliftIO m, Mutable r Ix1 (e, Ix1), Mutable r Ix2 e, Ord e ) => Ix1 -> IntSet -> MArray (PrimState m) r Ix2 e -> MArray (PrimState m) r Ix1 (e, Ix1) -> PQ.PSQ Ix1 e -> m (MArray (PrimState m) r Ix1 (e, Ix1), PQ.PSQ Ix1 e) updateBNeighbour b s distMat nNghbr pq = if b >= nNeighbours then return (nNghbr, pq) else do rowAB <- searchRow b s distMat >>= unsafeFreeze Par newNeighbourB@(distB, neighbourB) <- minimumM rowAB writeM nNghbr b newNeighbourB let newPQ = PQ.adjust (const distB) neighbourB pq return (nNghbr, newPQ) where Sz nNeighbours = msize nNghbr -- | Find the nearest neighbour for each point from a distance matrix. For each point it stores the -- minimum distance and the index of the other point, that is the nearest neighbour but at a higher -- index. {-# SCC nearestNeighbours #-} nearestNeighbours :: ( MonadThrow m, Mutable r Ix1 e, Mutable r Ix1 (e, Ix1), OuterSlice r Ix2 e, Source (R r) Ix1 e, Ord e, Unbox e ) => Matrix r e -> m (Vector r (e, Ix1)) nearestNeighbours distMat | m /= n = throwM $ IndexException "Distance matrix is not square" | m == 0 = throwM $ IndexException "Distance matrix is empty" | otherwise = let rows = compute @B . outerSlices $ distMat minDistIx = Massiv.imap (\i v -> unsafePerformIO . minDistAtVec i . compute @U $ v) . init $ rows in return . compute $ minDistIx where Sz (m :. n) = size distMat -- Make a search row for distances. Takes row x from a distance matrix and zips them with their -- column index. Then keeps only the valid elements of the row, that are still part of the available -- points. A minimum or maximum search can be performed on the resulting vector and a valid pair of -- distance and index can be obtained. searchRow :: ( PrimMonad m, MonadThrow m, MonadUnliftIO m, Mutable r Ix2 e, Mutable r Ix1 (e, Ix1) ) => Ix1 -> IntSet -> MArray (PrimState m) r Ix2 e -> m (MArray (PrimState m) r Ix1 (e, Ix1)) searchRow x s dm = makeMArray Par (size ixV) $ \ix -> do dmIx <- ixV !? ix (dm `readM` (x :. dmIx)) >>= \dist -> return (dist, dmIx) where ixV :: Vector U Ix1 ixV = compute @U . sfilter (> x) . Massiv.fromList @U Par . IntSet.toAscList $ s