-- |
-- Module      :  ELynx.Data.MarkovProcess.AminoAcid
-- Description :  Amino acid rate matrices such as LG
-- Copyright   :  (c) Dominik Schrempf 2020
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Tue Jan 29 09:29:19 2019.
--
-- The order of amino acids is alphabetic.
module ELynx.Data.MarkovProcess.AminoAcid
  ( lg,
    lgCustom,
    wag,
    wagCustom,
    poisson,
    poissonCustom,
    gtr20,
  )
where

import Data.ByteString.Internal (c2w)
import Data.List (elemIndex)
import Data.Maybe (fromMaybe)
import qualified Data.Vector.Storable as V
import Data.Word (Word8)
import ELynx.Data.Alphabet.Alphabet
import ELynx.Data.MarkovProcess.RateMatrix
import ELynx.Data.MarkovProcess.SubstitutionModel
import Numeric.LinearAlgebra

n :: Int
n = 20

-- Some matrices have to be converted from PAML order to alphabetical order. See
-- 'pamlToAlphaVec' and 'pamlToAlphaMat'.

-- Amno acids in alphabetical order.
aaAlphaOrder :: [Word8]
aaAlphaOrder =
  map
    c2w
    [ 'A',
      'C',
      'D',
      'E',
      'F',
      'G',
      'H',
      'I',
      'K',
      'L',
      'M',
      'N',
      'P',
      'Q',
      'R',
      'S',
      'T',
      'V',
      'W',
      'Y'
    ]

-- Amino acids in PAML oder.
aaPamlOrder :: [Word8]
aaPamlOrder =
  map
    c2w
    [ 'A',
      'R',
      'N',
      'D',
      'C',
      'Q',
      'E',
      'G',
      'H',
      'I',
      'L',
      'K',
      'M',
      'F',
      'P',
      'S',
      'T',
      'W',
      'Y',
      'V'
    ]

-- -- This is a very slow implementation; since I only convert matrices once it
-- -- should not be a problem. A map would be better if performance is an issue.
-- pamlIndexToAlphaIndex :: Int -> Int
-- pamlIndexToAlphaIndex i = fromMaybe
--                           (error $ "Could not convert index " ++ show i ++ ".")
--                           (elemIndex aa aaAlphaOrder)
--   where aa = aaPamlOrder !! i

-- This is a very slow implementation; since I only convert matrices once it
-- should not be a problem. A map would be better if performance is an issue.
alphaIndexToPamlIndex :: Int -> Int
alphaIndexToPamlIndex i =
  fromMaybe
    (error $ "Could not convert index " ++ show i ++ ".")
    (elemIndex aa aaPamlOrder)
  where
    aa = aaAlphaOrder !! i

-- Convert an amino acid vector in PAML order to a vector in alphabetical order.
pamlToAlphaVec :: Vector R -> Vector R
pamlToAlphaVec v = build n (\i -> v ! alphaIndexToPamlIndex (round i))

-- Convert an amino acid matrix in PAML order to a matrix in alphabetical order.
pamlToAlphaMat :: Matrix R -> Matrix R
pamlToAlphaMat m =
  build
    (n, n)
    ( \i j -> m ! alphaIndexToPamlIndex (round i) ! alphaIndexToPamlIndex (round j)
    )

-- Lower triangular matrix of LG exchangeabilities in PAML order and in form of
-- a list.
lgExchRawPaml :: [Double]
lgExchRawPaml =
  [ 0.425093,
    0.276818,
    0.751878,
    0.395144,
    0.123954,
    5.076149,
    2.489084,
    0.534551,
    0.528768,
    0.062556,
    0.969894,
    2.807908,
    1.695752,
    0.523386,
    0.084808,
    1.038545,
    0.363970,
    0.541712,
    5.243870,
    0.003499,
    4.128591,
    2.066040,
    0.390192,
    1.437645,
    0.844926,
    0.569265,
    0.267959,
    0.348847,
    0.358858,
    2.426601,
    4.509238,
    0.927114,
    0.640543,
    4.813505,
    0.423881,
    0.311484,
    0.149830,
    0.126991,
    0.191503,
    0.010690,
    0.320627,
    0.072854,
    0.044265,
    0.008705,
    0.108882,
    0.395337,
    0.301848,
    0.068427,
    0.015076,
    0.594007,
    0.582457,
    0.069673,
    0.044261,
    0.366317,
    4.145067,
    0.536518,
    6.326067,
    2.145078,
    0.282959,
    0.013266,
    3.234294,
    1.807177,
    0.296636,
    0.697264,
    0.159069,
    0.137500,
    1.124035,
    0.484133,
    0.371004,
    0.025548,
    0.893680,
    1.672569,
    0.173735,
    0.139538,
    0.442472,
    4.273607,
    6.312358,
    0.656604,
    0.253701,
    0.052722,
    0.089525,
    0.017416,
    1.105251,
    0.035855,
    0.018811,
    0.089586,
    0.682139,
    1.112727,
    2.592692,
    0.023918,
    1.798853,
    1.177651,
    0.332533,
    0.161787,
    0.394456,
    0.075382,
    0.624294,
    0.419409,
    0.196961,
    0.508851,
    0.078281,
    0.249060,
    0.390322,
    0.099849,
    0.094464,
    4.727182,
    0.858151,
    4.008358,
    1.240275,
    2.784478,
    1.223828,
    0.611973,
    1.739990,
    0.990012,
    0.064105,
    0.182287,
    0.748683,
    0.346960,
    0.361819,
    1.338132,
    2.139501,
    0.578987,
    2.000679,
    0.425860,
    1.143480,
    1.080136,
    0.604545,
    0.129836,
    0.584262,
    1.033739,
    0.302936,
    1.136863,
    2.020366,
    0.165001,
    0.571468,
    6.472279,
    0.180717,
    0.593607,
    0.045376,
    0.029890,
    0.670128,
    0.236199,
    0.077852,
    0.268491,
    0.597054,
    0.111660,
    0.619632,
    0.049906,
    0.696175,
    2.457121,
    0.095131,
    0.248862,
    0.140825,
    0.218959,
    0.314440,
    0.612025,
    0.135107,
    1.165532,
    0.257336,
    0.120037,
    0.054679,
    5.306834,
    0.232523,
    0.299648,
    0.131932,
    0.481306,
    7.803902,
    0.089613,
    0.400547,
    0.245841,
    3.151815,
    2.547870,
    0.170887,
    0.083688,
    0.037967,
    1.959291,
    0.210332,
    0.245034,
    0.076701,
    0.119013,
    10.649107,
    1.702745,
    0.185202,
    1.898718,
    0.654683,
    0.296501,
    0.098369,
    2.188158,
    0.189510,
    0.249313
  ]

-- Exchangeabilities of LG model in alphabetical order.
lgExch :: ExchangeabilityMatrix
lgExch = pamlToAlphaMat $ exchFromListLower n lgExchRawPaml

normalizeSumVec :: Vector Double -> Vector Double
normalizeSumVec v = V.map (/ s) v
  where
    s = V.sum v
{-# INLINE normalizeSumVec #-}

-- Stationary distribution in PAML order.
lgStatDistPaml :: StationaryDistribution
lgStatDistPaml =
  normalizeSumVec $
    fromList
      [ 0.079066,
        0.055941,
        0.041977,
        0.053052,
        0.012937,
        0.040767,
        0.071586,
        0.057337,
        0.022355,
        0.062157,
        0.099081,
        0.064600,
        0.022951,
        0.042302,
        0.044040,
        0.061197,
        0.053287,
        0.012066,
        0.034155,
        0.069147
      ]

-- Stationary distribution of LG model in alphabetical order.
lgStatDist :: StationaryDistribution
lgStatDist = pamlToAlphaVec lgStatDistPaml

-- | LG substitution model.
lg :: SubstitutionModel
lg = substitutionModel Protein "LG" [] lgStatDist lgExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
lgCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
lgCustom mnm d = substitutionModel Protein nm [] d lgExch
  where
    nm = fromMaybe "LG-Custom" mnm

-- WAG exchangeability list in PAML order.
wagExchRawPaml :: [Double]
wagExchRawPaml =
  [ 55.15710,
    50.98480,
    63.53460,
    73.89980,
    14.73040,
    542.94200,
    102.70400,
    52.81910,
    26.52560,
    3.02949,
    90.85980,
    303.55000,
    154.36400,
    61.67830,
    9.88179,
    158.28500,
    43.91570,
    94.71980,
    617.41600,
    2.13520,
    546.94700,
    141.67200,
    58.46650,
    112.55600,
    86.55840,
    30.66740,
    33.00520,
    56.77170,
    31.69540,
    213.71500,
    395.62900,
    93.06760,
    24.89720,
    429.41100,
    57.00250,
    24.94100,
    19.33350,
    18.69790,
    55.42360,
    3.94370,
    17.01350,
    11.39170,
    12.73950,
    3.04501,
    13.81900,
    39.79150,
    49.76710,
    13.15280,
    8.48047,
    38.42870,
    86.94890,
    15.42630,
    6.13037,
    49.94620,
    317.09700,
    90.62650,
    535.14200,
    301.20100,
    47.98550,
    7.40339,
    389.49000,
    258.44300,
    37.35580,
    89.04320,
    32.38320,
    25.75550,
    89.34960,
    68.31620,
    19.82210,
    10.37540,
    39.04820,
    154.52600,
    31.51240,
    17.41000,
    40.41410,
    425.74600,
    485.40200,
    93.42760,
    21.04940,
    10.27110,
    9.61621,
    4.67304,
    39.80200,
    9.99208,
    8.11339,
    4.99310,
    67.93710,
    105.94700,
    211.51700,
    8.88360,
    119.06300,
    143.85500,
    67.94890,
    19.50810,
    42.39840,
    10.94040,
    93.33720,
    68.23550,
    24.35700,
    69.61980,
    9.99288,
    41.58440,
    55.68960,
    17.13290,
    16.14440,
    337.07900,
    122.41900,
    397.42300,
    107.17600,
    140.76600,
    102.88700,
    70.49390,
    134.18200,
    74.01690,
    31.94400,
    34.47390,
    96.71300,
    49.39050,
    54.59310,
    161.32800,
    212.11100,
    55.44130,
    203.00600,
    37.48660,
    51.29840,
    85.79280,
    82.27650,
    22.58330,
    47.33070,
    145.81600,
    32.66220,
    138.69800,
    151.61200,
    17.19030,
    79.53840,
    437.80200,
    11.31330,
    116.39200,
    7.19167,
    12.97670,
    71.70700,
    21.57370,
    15.65570,
    33.69830,
    26.25690,
    21.24830,
    66.53090,
    13.75050,
    51.57060,
    152.96400,
    13.94050,
    52.37420,
    11.08640,
    24.07350,
    38.15330,
    108.60000,
    32.57110,
    54.38330,
    22.77100,
    19.63030,
    10.36040,
    387.34400,
    42.01700,
    39.86180,
    13.32640,
    42.84370,
    645.42800,
    21.60460,
    78.69930,
    29.11480,
    248.53900,
    200.60100,
    25.18490,
    19.62460,
    15.23350,
    100.21400,
    30.12810,
    58.87310,
    18.72470,
    11.83580,
    782.13000,
    180.03400,
    30.54340,
    205.84500,
    64.98920,
    31.48870,
    23.27390,
    138.82300,
    36.53690,
    31.47300
  ]

-- WAG exchangeability matrix n alphabetical order.
wagExch :: ExchangeabilityMatrix
wagExch = pamlToAlphaMat $ exchFromListLower n wagExchRawPaml

-- WAG stationary distribution in PAML order.
wagStatDistPaml :: StationaryDistribution
wagStatDistPaml =
  normalizeSumVec $
    fromList
      [ 0.0866279,
        0.043972,
        0.0390894,
        0.0570451,
        0.0193078,
        0.0367281,
        0.0580589,
        0.0832518,
        0.0244313,
        0.048466,
        0.086209,
        0.0620286,
        0.0195027,
        0.0384319,
        0.0457631,
        0.0695179,
        0.0610127,
        0.0143859,
        0.0352742,
        0.0708957
      ]

-- WAG stationary distribution in alphabetical order.
wagStatDist :: StationaryDistribution
wagStatDist = pamlToAlphaVec wagStatDistPaml

-- | LG substitution model.
wag :: SubstitutionModel
wag = substitutionModel Protein "WAG" [] wagStatDist wagExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
wagCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
wagCustom mnm d = substitutionModel Protein nm [] d wagExch
  where
    nm = fromMaybe "WAG-Custom" mnm

matrixSetDiagToZero :: Matrix R -> Matrix R
matrixSetDiagToZero m = m - diag (takeDiag m)
{-# INLINE matrixSetDiagToZero #-}

uniformExch :: ExchangeabilityMatrix
uniformExch = matrixSetDiagToZero $ matrix n $ replicate (n * n) 1.0

poissonExch :: ExchangeabilityMatrix
poissonExch = uniformExch

uniformVec :: Vector Double
uniformVec = V.replicate n (1 / fromIntegral n)

-- | Poisson substitution model.
poisson :: SubstitutionModel
poisson = substitutionModel Protein "Poisson" [] uniformVec poissonExch

-- | Poisson substitution model with maybe a name and a custom stationary distribution.
poissonCustom :: Maybe String -> StationaryDistribution -> SubstitutionModel
poissonCustom mnm d = substitutionModel Protein nm [] d poissonExch
  where
    nm = fromMaybe "Poisson-Custom" mnm

-- | General time reversible (GTR) substitution model for amino acids.
gtr20 :: [Double] -> StationaryDistribution -> SubstitutionModel
gtr20 es d = substitutionModel Protein "GTR" es d e
  where
    e = exchFromListUpper n es