-- |
-- Module      :  ELynx.Data.MarkovProcess.AminoAcid
-- Description :  Amino acid rate matrices such as LG
-- Copyright   :  (c) Dominik Schrempf 2020
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Tue Jan 29 09:29:19 2019.
--
-- The order of amino acids is alphabetic.
module ELynx.Data.MarkovProcess.AminoAcid
  ( lg,
    lgCustom,
    wag,
    wagCustom,
    poisson,
    poissonCustom,
    gtr20,
  )
where

import Data.ByteString.Internal (c2w)
import Data.List (elemIndex)
import Data.Maybe (fromMaybe)
import qualified Data.Vector.Storable as V
import Data.Word (Word8)
import ELynx.Data.Alphabet.Alphabet
import ELynx.Data.MarkovProcess.RateMatrix
import ELynx.Data.MarkovProcess.SubstitutionModel
import Numeric.LinearAlgebra

n :: Int
n :: Int
n = Int
20

-- Some matrices have to be converted from PAML order to alphabetical order. See
-- 'pamlToAlphaVec' and 'pamlToAlphaMat'.

-- Amno acids in alphabetical order.
aaAlphaOrder :: [Word8]
aaAlphaOrder :: [Word8]
aaAlphaOrder =
  (Char -> Word8) -> [Char] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map
    Char -> Word8
c2w
    [ Char
'A',
      Char
'C',
      Char
'D',
      Char
'E',
      Char
'F',
      Char
'G',
      Char
'H',
      Char
'I',
      Char
'K',
      Char
'L',
      Char
'M',
      Char
'N',
      Char
'P',
      Char
'Q',
      Char
'R',
      Char
'S',
      Char
'T',
      Char
'V',
      Char
'W',
      Char
'Y'
    ]

-- Amino acids in PAML oder.
aaPamlOrder :: [Word8]
aaPamlOrder :: [Word8]
aaPamlOrder =
  (Char -> Word8) -> [Char] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map
    Char -> Word8
c2w
    [ Char
'A',
      Char
'R',
      Char
'N',
      Char
'D',
      Char
'C',
      Char
'Q',
      Char
'E',
      Char
'G',
      Char
'H',
      Char
'I',
      Char
'L',
      Char
'K',
      Char
'M',
      Char
'F',
      Char
'P',
      Char
'S',
      Char
'T',
      Char
'W',
      Char
'Y',
      Char
'V'
    ]

-- -- This is a very slow implementation; since I only convert matrices once it
-- -- should not be a problem. A map would be better if performance is an issue.
-- pamlIndexToAlphaIndex :: Int -> Int
-- pamlIndexToAlphaIndex i = fromMaybe
--                           (error $ "Could not convert index " ++ show i ++ ".")
--                           (elemIndex aa aaAlphaOrder)
--   where aa = aaPamlOrder !! i

-- This is a very slow implementation; since I only convert matrices once it
-- should not be a problem. A map would be better if performance is an issue.
alphaIndexToPamlIndex :: Int -> Int
alphaIndexToPamlIndex :: Int -> Int
alphaIndexToPamlIndex Int
i =
  Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe
    ([Char] -> Int
forall a. HasCallStack => [Char] -> a
error ([Char] -> Int) -> [Char] -> Int
forall a b. (a -> b) -> a -> b
$ [Char]
"Could not convert index " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
i [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
".")
    (Word8 -> [Word8] -> Maybe Int
forall a. Eq a => a -> [a] -> Maybe Int
elemIndex Word8
aa [Word8]
aaPamlOrder)
  where
    aa :: Word8
aa = [Word8]
aaAlphaOrder [Word8] -> Int -> Word8
forall a. [a] -> Int -> a
!! Int
i

-- Convert an amino acid vector in PAML order to a vector in alphabetical order.
pamlToAlphaVec :: Vector R -> Vector R
pamlToAlphaVec :: Vector R -> Vector R
pamlToAlphaVec Vector R
v = Int -> (R -> R) -> Vector R
forall d f (c :: * -> *) e. Build d f c e => d -> f -> c e
build Int
n (\R
i -> Vector R
v Vector R -> Int -> R
forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (R -> Int
forall a b. (RealFrac a, Integral b) => a -> b
round R
i))

-- Convert an amino acid matrix in PAML order to a matrix in alphabetical order.
pamlToAlphaMat :: Matrix R -> Matrix R
pamlToAlphaMat :: Matrix R -> Matrix R
pamlToAlphaMat Matrix R
m =
  (Int, Int) -> (R -> R -> R) -> Matrix R
forall d f (c :: * -> *) e. Build d f c e => d -> f -> c e
build
    (Int
n, Int
n)
    ( \R
i R
j -> Matrix R
m Matrix R -> Int -> Vector R
forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (R -> Int
forall a b. (RealFrac a, Integral b) => a -> b
round R
i) Vector R -> Int -> R
forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (R -> Int
forall a b. (RealFrac a, Integral b) => a -> b
round R
j)
    )

-- Lower triangular matrix of LG exchangeabilities in PAML order and in form of
-- a list.
lgExchRawPaml :: [Double]
lgExchRawPaml :: [R]
lgExchRawPaml =
  [ R
0.425093,
    R
0.276818,
    R
0.751878,
    R
0.395144,
    R
0.123954,
    R
5.076149,
    R
2.489084,
    R
0.534551,
    R
0.528768,
    R
0.062556,
    R
0.969894,
    R
2.807908,
    R
1.695752,
    R
0.523386,
    R
0.084808,
    R
1.038545,
    R
0.363970,
    R
0.541712,
    R
5.243870,
    R
0.003499,
    R
4.128591,
    R
2.066040,
    R
0.390192,
    R
1.437645,
    R
0.844926,
    R
0.569265,
    R
0.267959,
    R
0.348847,
    R
0.358858,
    R
2.426601,
    R
4.509238,
    R
0.927114,
    R
0.640543,
    R
4.813505,
    R
0.423881,
    R
0.311484,
    R
0.149830,
    R
0.126991,
    R
0.191503,
    R
0.010690,
    R
0.320627,
    R
0.072854,
    R
0.044265,
    R
0.008705,
    R
0.108882,
    R
0.395337,
    R
0.301848,
    R
0.068427,
    R
0.015076,
    R
0.594007,
    R
0.582457,
    R
0.069673,
    R
0.044261,
    R
0.366317,
    R
4.145067,
    R
0.536518,
    R
6.326067,
    R
2.145078,
    R
0.282959,
    R
0.013266,
    R
3.234294,
    R
1.807177,
    R
0.296636,
    R
0.697264,
    R
0.159069,
    R
0.137500,
    R
1.124035,
    R
0.484133,
    R
0.371004,
    R
0.025548,
    R
0.893680,
    R
1.672569,
    R
0.173735,
    R
0.139538,
    R
0.442472,
    R
4.273607,
    R
6.312358,
    R
0.656604,
    R
0.253701,
    R
0.052722,
    R
0.089525,
    R
0.017416,
    R
1.105251,
    R
0.035855,
    R
0.018811,
    R
0.089586,
    R
0.682139,
    R
1.112727,
    R
2.592692,
    R
0.023918,
    R
1.798853,
    R
1.177651,
    R
0.332533,
    R
0.161787,
    R
0.394456,
    R
0.075382,
    R
0.624294,
    R
0.419409,
    R
0.196961,
    R
0.508851,
    R
0.078281,
    R
0.249060,
    R
0.390322,
    R
0.099849,
    R
0.094464,
    R
4.727182,
    R
0.858151,
    R
4.008358,
    R
1.240275,
    R
2.784478,
    R
1.223828,
    R
0.611973,
    R
1.739990,
    R
0.990012,
    R
0.064105,
    R
0.182287,
    R
0.748683,
    R
0.346960,
    R
0.361819,
    R
1.338132,
    R
2.139501,
    R
0.578987,
    R
2.000679,
    R
0.425860,
    R
1.143480,
    R
1.080136,
    R
0.604545,
    R
0.129836,
    R
0.584262,
    R
1.033739,
    R
0.302936,
    R
1.136863,
    R
2.020366,
    R
0.165001,
    R
0.571468,
    R
6.472279,
    R
0.180717,
    R
0.593607,
    R
0.045376,
    R
0.029890,
    R
0.670128,
    R
0.236199,
    R
0.077852,
    R
0.268491,
    R
0.597054,
    R
0.111660,
    R
0.619632,
    R
0.049906,
    R
0.696175,
    R
2.457121,
    R
0.095131,
    R
0.248862,
    R
0.140825,
    R
0.218959,
    R
0.314440,
    R
0.612025,
    R
0.135107,
    R
1.165532,
    R
0.257336,
    R
0.120037,
    R
0.054679,
    R
5.306834,
    R
0.232523,
    R
0.299648,
    R
0.131932,
    R
0.481306,
    R
7.803902,
    R
0.089613,
    R
0.400547,
    R
0.245841,
    R
3.151815,
    R
2.547870,
    R
0.170887,
    R
0.083688,
    R
0.037967,
    R
1.959291,
    R
0.210332,
    R
0.245034,
    R
0.076701,
    R
0.119013,
    R
10.649107,
    R
1.702745,
    R
0.185202,
    R
1.898718,
    R
0.654683,
    R
0.296501,
    R
0.098369,
    R
2.188158,
    R
0.189510,
    R
0.249313
  ]

-- Exchangeabilities of LG model in alphabetical order.
lgExch :: ExchangeabilityMatrix
lgExch :: Matrix R
lgExch = Matrix R -> Matrix R
pamlToAlphaMat (Matrix R -> Matrix R) -> Matrix R -> Matrix R
forall a b. (a -> b) -> a -> b
$ Int -> [R] -> Matrix R
forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListLower Int
n [R]
lgExchRawPaml

normalizeSumVec :: Vector Double -> Vector Double
normalizeSumVec :: Vector R -> Vector R
normalizeSumVec Vector R
v = (R -> R) -> Vector R -> Vector R
forall a b.
(Storable a, Storable b) =>
(a -> b) -> Vector a -> Vector b
V.map (R -> R -> R
forall a. Fractional a => a -> a -> a
/ R
s) Vector R
v
  where
    s :: R
s = Vector R -> R
forall a. (Storable a, Num a) => Vector a -> a
V.sum Vector R
v
{-# INLINE normalizeSumVec #-}

-- Stationary distribution in PAML order.
lgStatDistPaml :: StationaryDistribution
lgStatDistPaml :: Vector R
lgStatDistPaml =
  Vector R -> Vector R
normalizeSumVec (Vector R -> Vector R) -> Vector R -> Vector R
forall a b. (a -> b) -> a -> b
$
    [R] -> Vector R
forall a. Storable a => [a] -> Vector a
fromList
      [ R
0.079066,
        R
0.055941,
        R
0.041977,
        R
0.053052,
        R
0.012937,
        R
0.040767,
        R
0.071586,
        R
0.057337,
        R
0.022355,
        R
0.062157,
        R
0.099081,
        R
0.064600,
        R
0.022951,
        R
0.042302,
        R
0.044040,
        R
0.061197,
        R
0.053287,
        R
0.012066,
        R
0.034155,
        R
0.069147
      ]

-- Stationary distribution of LG model in alphabetical order.
lgStatDist :: StationaryDistribution
lgStatDist :: Vector R
lgStatDist = Vector R -> Vector R
pamlToAlphaVec Vector R
lgStatDistPaml

-- | LG substitution model.
lg :: SubstitutionModel
lg :: SubstitutionModel
lg = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"LG" [] Vector R
lgStatDist Matrix R
lgExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
lgCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
lgCustom :: Maybe [Char] -> Vector R -> SubstitutionModel
lgCustom Maybe [Char]
mnm Vector R
d = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Vector R
d Matrix R
lgExch
  where
    nm :: [Char]
nm = [Char] -> Maybe [Char] -> [Char]
forall a. a -> Maybe a -> a
fromMaybe [Char]
"LG-Custom" Maybe [Char]
mnm

-- WAG exchangeability list in PAML order.
wagExchRawPaml :: [Double]
wagExchRawPaml :: [R]
wagExchRawPaml =
  [ R
55.15710,
    R
50.98480,
    R
63.53460,
    R
73.89980,
    R
14.73040,
    R
542.94200,
    R
102.70400,
    R
52.81910,
    R
26.52560,
    R
3.02949,
    R
90.85980,
    R
303.55000,
    R
154.36400,
    R
61.67830,
    R
9.88179,
    R
158.28500,
    R
43.91570,
    R
94.71980,
    R
617.41600,
    R
2.13520,
    R
546.94700,
    R
141.67200,
    R
58.46650,
    R
112.55600,
    R
86.55840,
    R
30.66740,
    R
33.00520,
    R
56.77170,
    R
31.69540,
    R
213.71500,
    R
395.62900,
    R
93.06760,
    R
24.89720,
    R
429.41100,
    R
57.00250,
    R
24.94100,
    R
19.33350,
    R
18.69790,
    R
55.42360,
    R
3.94370,
    R
17.01350,
    R
11.39170,
    R
12.73950,
    R
3.04501,
    R
13.81900,
    R
39.79150,
    R
49.76710,
    R
13.15280,
    R
8.48047,
    R
38.42870,
    R
86.94890,
    R
15.42630,
    R
6.13037,
    R
49.94620,
    R
317.09700,
    R
90.62650,
    R
535.14200,
    R
301.20100,
    R
47.98550,
    R
7.40339,
    R
389.49000,
    R
258.44300,
    R
37.35580,
    R
89.04320,
    R
32.38320,
    R
25.75550,
    R
89.34960,
    R
68.31620,
    R
19.82210,
    R
10.37540,
    R
39.04820,
    R
154.52600,
    R
31.51240,
    R
17.41000,
    R
40.41410,
    R
425.74600,
    R
485.40200,
    R
93.42760,
    R
21.04940,
    R
10.27110,
    R
9.61621,
    R
4.67304,
    R
39.80200,
    R
9.99208,
    R
8.11339,
    R
4.99310,
    R
67.93710,
    R
105.94700,
    R
211.51700,
    R
8.88360,
    R
119.06300,
    R
143.85500,
    R
67.94890,
    R
19.50810,
    R
42.39840,
    R
10.94040,
    R
93.33720,
    R
68.23550,
    R
24.35700,
    R
69.61980,
    R
9.99288,
    R
41.58440,
    R
55.68960,
    R
17.13290,
    R
16.14440,
    R
337.07900,
    R
122.41900,
    R
397.42300,
    R
107.17600,
    R
140.76600,
    R
102.88700,
    R
70.49390,
    R
134.18200,
    R
74.01690,
    R
31.94400,
    R
34.47390,
    R
96.71300,
    R
49.39050,
    R
54.59310,
    R
161.32800,
    R
212.11100,
    R
55.44130,
    R
203.00600,
    R
37.48660,
    R
51.29840,
    R
85.79280,
    R
82.27650,
    R
22.58330,
    R
47.33070,
    R
145.81600,
    R
32.66220,
    R
138.69800,
    R
151.61200,
    R
17.19030,
    R
79.53840,
    R
437.80200,
    R
11.31330,
    R
116.39200,
    R
7.19167,
    R
12.97670,
    R
71.70700,
    R
21.57370,
    R
15.65570,
    R
33.69830,
    R
26.25690,
    R
21.24830,
    R
66.53090,
    R
13.75050,
    R
51.57060,
    R
152.96400,
    R
13.94050,
    R
52.37420,
    R
11.08640,
    R
24.07350,
    R
38.15330,
    R
108.60000,
    R
32.57110,
    R
54.38330,
    R
22.77100,
    R
19.63030,
    R
10.36040,
    R
387.34400,
    R
42.01700,
    R
39.86180,
    R
13.32640,
    R
42.84370,
    R
645.42800,
    R
21.60460,
    R
78.69930,
    R
29.11480,
    R
248.53900,
    R
200.60100,
    R
25.18490,
    R
19.62460,
    R
15.23350,
    R
100.21400,
    R
30.12810,
    R
58.87310,
    R
18.72470,
    R
11.83580,
    R
782.13000,
    R
180.03400,
    R
30.54340,
    R
205.84500,
    R
64.98920,
    R
31.48870,
    R
23.27390,
    R
138.82300,
    R
36.53690,
    R
31.47300
  ]

-- WAG exchangeability matrix n alphabetical order.
wagExch :: ExchangeabilityMatrix
wagExch :: Matrix R
wagExch = Matrix R -> Matrix R
pamlToAlphaMat (Matrix R -> Matrix R) -> Matrix R -> Matrix R
forall a b. (a -> b) -> a -> b
$ Int -> [R] -> Matrix R
forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListLower Int
n [R]
wagExchRawPaml

-- WAG stationary distribution in PAML order.
wagStatDistPaml :: StationaryDistribution
wagStatDistPaml :: Vector R
wagStatDistPaml =
  Vector R -> Vector R
normalizeSumVec (Vector R -> Vector R) -> Vector R -> Vector R
forall a b. (a -> b) -> a -> b
$
    [R] -> Vector R
forall a. Storable a => [a] -> Vector a
fromList
      [ R
0.0866279,
        R
0.043972,
        R
0.0390894,
        R
0.0570451,
        R
0.0193078,
        R
0.0367281,
        R
0.0580589,
        R
0.0832518,
        R
0.0244313,
        R
0.048466,
        R
0.086209,
        R
0.0620286,
        R
0.0195027,
        R
0.0384319,
        R
0.0457631,
        R
0.0695179,
        R
0.0610127,
        R
0.0143859,
        R
0.0352742,
        R
0.0708957
      ]

-- WAG stationary distribution in alphabetical order.
wagStatDist :: StationaryDistribution
wagStatDist :: Vector R
wagStatDist = Vector R -> Vector R
pamlToAlphaVec Vector R
wagStatDistPaml

-- | LG substitution model.
wag :: SubstitutionModel
wag :: SubstitutionModel
wag = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"WAG" [] Vector R
wagStatDist Matrix R
wagExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
wagCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
wagCustom :: Maybe [Char] -> Vector R -> SubstitutionModel
wagCustom Maybe [Char]
mnm Vector R
d = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Vector R
d Matrix R
wagExch
  where
    nm :: [Char]
nm = [Char] -> Maybe [Char] -> [Char]
forall a. a -> Maybe a -> a
fromMaybe [Char]
"WAG-Custom" Maybe [Char]
mnm

matrixSetDiagToZero :: Matrix R -> Matrix R
matrixSetDiagToZero :: Matrix R -> Matrix R
matrixSetDiagToZero Matrix R
m = Matrix R
m Matrix R -> Matrix R -> Matrix R
forall a. Num a => a -> a -> a
- Vector R -> Matrix R
forall a. (Num a, Element a) => Vector a -> Matrix a
diag (Matrix R -> Vector R
forall t. Element t => Matrix t -> Vector t
takeDiag Matrix R
m)
{-# INLINE matrixSetDiagToZero #-}

uniformExch :: ExchangeabilityMatrix
uniformExch :: Matrix R
uniformExch = Matrix R -> Matrix R
matrixSetDiagToZero (Matrix R -> Matrix R) -> Matrix R -> Matrix R
forall a b. (a -> b) -> a -> b
$ Int -> [R] -> Matrix R
matrix Int
n ([R] -> Matrix R) -> [R] -> Matrix R
forall a b. (a -> b) -> a -> b
$ Int -> R -> [R]
forall a. Int -> a -> [a]
replicate (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
n) R
1.0

poissonExch :: ExchangeabilityMatrix
poissonExch :: Matrix R
poissonExch = Matrix R
uniformExch

uniformVec :: Vector Double
uniformVec :: Vector R
uniformVec = Int -> R -> Vector R
forall a. Storable a => Int -> a -> Vector a
V.replicate Int
n (R
1 R -> R -> R
forall a. Fractional a => a -> a -> a
/ Int -> R
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n)

-- | Poisson substitution model.
poisson :: SubstitutionModel
poisson :: SubstitutionModel
poisson = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"Poisson" [] Vector R
uniformVec Matrix R
poissonExch

-- | Poisson substitution model with maybe a name and a custom stationary distribution.
poissonCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
poissonCustom :: Maybe [Char] -> Vector R -> SubstitutionModel
poissonCustom Maybe [Char]
mnm Vector R
d = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Vector R
d Matrix R
poissonExch
  where
    nm :: [Char]
nm = [Char] -> Maybe [Char] -> [Char]
forall a. a -> Maybe a -> a
fromMaybe [Char]
"Poisson-Custom" Maybe [Char]
mnm

-- | General time reversible (GTR) substitution model for amino acids.
gtr20 :: [Double] -> StationaryDistribution -> SubstitutionModel
gtr20 :: [R] -> Vector R -> SubstitutionModel
gtr20 [R]
es Vector R
d = Alphabet
-> [Char] -> [R] -> Vector R -> Matrix R -> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"GTR" [R]
es Vector R
d Matrix R
e
  where
    e :: Matrix R
e = Int -> [R] -> Matrix R
forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListUpper Int
n [R]
es