{-# LANGUAGE FlexibleContexts #-}

-- |
-- Module      :  ELynx.Data.Alphabet.DistributionDiversity
-- Description :  Summarize statistics for alphabets
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Mon Feb 25 13:32:56 2019.
module ELynx.Data.Alphabet.DistributionDiversity
  ( -- * Entropy
    entropy,
    kEffEntropy,

    -- * Homoplasy
    homoplasy,
    kEffHomoplasy,

    -- * Count characters
    frequencyCharacters,
  )
where

import qualified Data.Set as S
import Data.Vector.Generic
  ( Vector,
    toList,
  )
import qualified Data.Vector.Generic as V
import ELynx.Data.Alphabet.Alphabet
import ELynx.Data.Alphabet.Character

eps :: Double
eps :: Double
eps = Double
1e-12

-- Calculate x*log(x) but set to 0.0 when x is smaller than 'eps'.
xLogX :: Double -> Double
xLogX :: Double -> Double
xLogX Double
x
  | Double
x Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
0.0 = [Char] -> Double
forall a. HasCallStack => [Char] -> a
error [Char]
"Argument lower than zero."
  | Double
eps Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
x = Double
0.0
  | Bool
otherwise = Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double -> Double
forall a. Floating a => a -> a
log Double
x

-- | Entropy of vector.
entropy :: (Vector v Double) => v Double -> Double
entropy :: v Double -> Double
entropy v Double
v =
  if Double -> Bool
forall a. RealFloat a => a -> Bool
isNaN Double
res
    then
      [Char] -> Double
forall a. HasCallStack => [Char] -> a
error
        ([Char]
"entropy: Sesult of following vector is NaN: " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Double] -> [Char]
forall a. Show a => a -> [Char]
show (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
toList v Double
v) [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
".")
    else Double
res
  where
    res :: Double
res = Double -> Double
forall a. Num a => a -> a
negate (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Double
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map Double -> Double
xLogX v Double
v

-- | Effective number of used characters measured using 'entropy'. The result
-- only makes sense when the sum of the array is 1.0.
kEffEntropy :: Vector v Double => v Double -> Double
kEffEntropy :: v Double -> Double
kEffEntropy v Double
v = if Double
e Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
eps then Double
1.0 else Double -> Double
forall a. Floating a => a -> a
exp Double
e where e :: Double
e = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
entropy v Double
v

-- | Probability of homoplasy of vector. The result is the probability of
-- binomially sampling the same character twice and only makes sense when the
-- sum of the array is 1.0.
homoplasy :: Vector v Double => v Double -> Double
homoplasy :: v Double -> Double
homoplasy v Double
v = v Double -> Double
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (\Double
x -> Double
x Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
x) v Double
v

-- | Effective number of used characters measured using 'homoplasy'. The result
-- only makes sense when the sum of the array is 1.0.
kEffHomoplasy :: Vector v Double => v Double -> Double
kEffHomoplasy :: v Double -> Double
kEffHomoplasy v Double
v = Double
1.0 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
homoplasy v Double
v

-- XXX: Use mutable vector; then V.// is much faster.
-- Increment element at index in vector by one.
incrementElemIndexByOne :: Vector v Int => [Int] -> v Int -> v Int
incrementElemIndexByOne :: [Int] -> v Int -> v Int
incrementElemIndexByOne [Int]
is v Int
v = v Int
v v Int -> [(Int, Int)] -> v Int
forall (v :: * -> *) a. Vector v a => v a -> [(Int, a)] -> v a
V.// [Int] -> [Int] -> [(Int, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Int]
is [Int]
es'
  where
    es' :: [Int]
es' = [v Int
v v Int -> Int -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
V.! Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1 | Int
i <- [Int]
is]

-- For a given code and counts vector, increment the count of the given character.
acc :: Vector v Int => AlphabetSpec -> v Int -> Character -> v Int
acc :: AlphabetSpec -> v Int -> Character -> v Int
acc AlphabetSpec
alph v Int
vec Character
char = [Int] -> v Int -> v Int
forall (v :: * -> *). Vector v Int => [Int] -> v Int -> v Int
incrementElemIndexByOne [Int]
is v Int
vec
  where
    is :: [Int]
is = [Character -> Set Character -> Int
forall a. Ord a => a -> Set a -> Int
S.findIndex Character
c (AlphabetSpec -> Set Character
std AlphabetSpec
alph) | Character
c <- AlphabetSpec -> Character -> [Character]
toStd AlphabetSpec
alph Character
char]

countCharacters ::
  (Vector v Character, Vector v Int) => AlphabetSpec -> v Character -> v Int
countCharacters :: AlphabetSpec -> v Character -> v Int
countCharacters AlphabetSpec
alph = (v Int -> Character -> v Int) -> v Int -> v Character -> v Int
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
V.foldl' (AlphabetSpec -> v Int -> Character -> v Int
forall (v :: * -> *).
Vector v Int =>
AlphabetSpec -> v Int -> Character -> v Int
acc AlphabetSpec
alph) v Int
zeroCounts
  where
    nChars :: Int
nChars = Set Character -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (AlphabetSpec -> Set Character
std AlphabetSpec
alph)
    zeroCounts :: v Int
zeroCounts = Int -> Int -> v Int
forall (v :: * -> *) a. Vector v a => Int -> a -> v a
V.replicate Int
nChars (Int
0 :: Int)

saveDivision :: Int -> Int -> Double
saveDivision :: Int -> Int -> Double
saveDivision Int
value Int
divisor =
  if Int
divisor Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 then Double
0.0 else Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
value Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
divisor

-- | For a given code vector of characters, calculate frequency of characters.
-- The input vector has arbitrary length (most often the number of sequences in
-- an alignment), the length of the output vector is the number of characters in
-- the alphabet.
frequencyCharacters ::
  (Vector v Character, Vector v Int, Vector v Double) =>
  AlphabetSpec ->
  v Character ->
  v Double
frequencyCharacters :: AlphabetSpec -> v Character -> v Double
frequencyCharacters AlphabetSpec
alph v Character
d = (Int -> Double) -> v Int -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Int -> Int -> Double
`saveDivision` Int
s) v Int
counts
  where
    counts :: v Int
counts = AlphabetSpec -> v Character -> v Int
forall (v :: * -> *).
(Vector v Character, Vector v Int) =>
AlphabetSpec -> v Character -> v Int
countCharacters AlphabetSpec
alph v Character
d
    s :: Int
s = v Int -> Int
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum v Int
counts