{- |
Module      :  Simulate.PhyloModel
Description :  Parse and interpret the model string
Copyright   :  (c) Dominik Schrempf 2019
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Fri Feb  1 13:32:16 2019.

-}

module Simulate.PhyloModel
  ( getPhyloModel
  ) where

import qualified Data.ByteString.Lazy.Char8                    as L
import           Data.Maybe
import           Data.Void
import           Data.Word                                     (Word8)
import           Numeric.LinearAlgebra                         (norm_1, size,
                                                                vector)
import           Text.Megaparsec
import           Text.Megaparsec.Byte
import           Text.Megaparsec.Byte.Lexer

import           ELynx.Data.MarkovProcess.AminoAcid
import           ELynx.Data.MarkovProcess.CXXModels
import qualified ELynx.Data.MarkovProcess.MixtureModel         as M
import           ELynx.Data.MarkovProcess.Nucleotide
import qualified ELynx.Data.MarkovProcess.PhyloModel           as P
import           ELynx.Data.MarkovProcess.RateMatrix
import qualified ELynx.Data.MarkovProcess.SubstitutionModel    as S
import           ELynx.Import.MarkovProcess.EDMModelPhylobayes (EDMComponent)
import           ELynx.Tools.ByteString
import           ELynx.Tools.Equality
import           ELynx.Tools.InputOutput

type Parser = Parsec Void L.ByteString

bs :: String -> L.ByteString
bs = L.pack

nNuc :: Int
-- nNuc = length (alphabet :: [Nucleotide])
nNuc = 4

nAA :: Int
-- nAA = length (alphabet :: [AminoAcid])
nAA = 20

-- Model parameters between square brackets.
paramsStart :: Word8
paramsStart = c2w '['

paramsEnd :: Word8
paramsEnd = c2w ']'

-- Stationary distribution between curly brackets.
sdStart :: Word8
sdStart = c2w '{'

sdEnd :: Word8
sdEnd = c2w '}'

-- Mixture model components between round brackets.
mmStart :: Word8
mmStart = c2w '('

mmEnd :: Word8
mmEnd = c2w ')'

separator :: Word8
separator = c2w ','

name :: Parser String
name = L.unpack <$>
  takeWhile1P (Just "Model name") (`notElem` [paramsStart, paramsEnd, sdStart, sdEnd, mmStart, mmEnd, separator])

params :: Parser [Double]
params = between (char paramsStart) (char paramsEnd) (sepBy1 float (char separator))

stationaryDistribution :: Parser StationaryDistribution
stationaryDistribution = do
  f <- vector <$> between (char sdStart) (char sdEnd) (sepBy1 float (char separator))
  if nearlyEq (norm_1 f) 1.0
    then return f
    else error $ "Sum of stationary distribution is " ++ show (norm_1 f)
         ++ " but should be 1.0."

assertLength :: StationaryDistribution -> Int -> a -> a
assertLength d n r = if size d /= n
                    then error $ "Length of stationary distribution is " ++ show (size d)
                         ++ " but should be " ++ show n ++ "."
                    else r

-- This is the main function that connects the model string, the parameters and
-- the stationary distribution. It should check that the model is valid.
assembleSubstitutionModel :: String -> Maybe S.Params -> Maybe StationaryDistribution
                          -> Either String S.SubstitutionModel
-- DNA models.
assembleSubstitutionModel "JC" Nothing Nothing = Right jc
assembleSubstitutionModel "HKY" (Just [k]) (Just d) = Right $ assertLength d nNuc $ hky k d
-- Protein models.
assembleSubstitutionModel "LG" Nothing Nothing = Right lg
assembleSubstitutionModel "LG-Custom" Nothing (Just d) = Right $ assertLength d nAA $ lgCustom Nothing d
assembleSubstitutionModel "WAG" Nothing Nothing = Right wag
assembleSubstitutionModel "WAG-Custom" Nothing (Just d) = Right $ assertLength d nAA $ wagCustom Nothing d
assembleSubstitutionModel "Poisson" Nothing Nothing = Right poisson
assembleSubstitutionModel "Poisson-Custom" Nothing (Just d) = Right $ assertLength d nAA $ poissonCustom Nothing d
-- Ohterwisse, we cannot assemble the model.
assembleSubstitutionModel n mps mf = Left $ unlines
  [ "Cannot assemble substitution model. "
  , "Name: " ++ show n
  , "Parameters: " ++ show mps
  , "Stationary distribution: " ++ show mf ]

parseSubstitutionModel :: Parser S.SubstitutionModel
parseSubstitutionModel = do
  n  <- name
  mps <- optional params
  mf <- optional stationaryDistribution
  let esm = assembleSubstitutionModel n mps mf
  case esm of
    Left err -> fail err
    Right sm -> return sm

edmModel :: [EDMComponent] -> Maybe [M.Weight] -> Parser M.MixtureModel
edmModel cs mws = do
  _ <- chunk (bs "EDM")
  _ <- char mmStart
  n <- name
  mps <- optional params
  _ <- char mmEnd
  let sms     = map (\c -> assembleSubstitutionModel n mps (Just $ snd c)) cs
      edmName = "EDM" ++ show (length cs)
      ws      = fromMaybe (map fst cs) mws
      errs    = [ e | (Left e) <- sms ]
  if not $ null errs
  then fail $ head errs
  else return $ M.MixtureModel edmName
    [ M.Component w sm | (w, Right sm) <- zip ws sms ]

cxxModel :: Maybe [M.Weight] -> Parser M.MixtureModel
cxxModel mws = do
  _ <- char (c2w 'C')
  n <- decimal :: Parser Int
  case cxx n mws of
    Nothing -> fail "Only 10, 20, 30, 40, 50, and 60 components are supported."
    Just m -> return m

standardMixtureModel :: [M.Weight] -> Parser M.MixtureModel
standardMixtureModel ws = do
  _ <- chunk (bs "MIXTURE")
  _ <- char mmStart
  sms <- parseSubstitutionModel `sepBy1` char separator
  _ <- char mmEnd
  return $ M.MixtureModel "MIXTURE"
    [ M.Component w sm | (w, sm) <- zip ws sms]

mixtureModel :: Maybe [EDMComponent] -> Maybe [M.Weight] -> Parser M.MixtureModel
mixtureModel Nothing   Nothing       = try (cxxModel Nothing) <|> fail "No weights provided."
mixtureModel Nothing   mws@(Just ws) = try (cxxModel mws) <|> standardMixtureModel ws
mixtureModel (Just cs) mws           = edmModel cs mws

-- | Parse the phylogenetic model string. The argument list is somewhat long,
-- but models can have many parameters and we have to check for redundant
-- parameters.
--
-- @
-- getPhyloModel maybeSubstitutionModelString maybeMixtureModelString maybeEDMComponents
-- @
getPhyloModel :: Maybe String -> Maybe String -> Maybe [M.Weight] -> Maybe [EDMComponent]
              -> Either String P.PhyloModel
getPhyloModel Nothing Nothing _ _ =
  Left "No model was given. See help."
getPhyloModel (Just _) (Just _) _ _ =
  Left "Both, substitution and mixture model string given; use only one."
getPhyloModel (Just s) Nothing Nothing Nothing =
  Right $ P.SubstitutionModel $ parseStringWith "Substitution model string" parseSubstitutionModel s
getPhyloModel (Just _) Nothing (Just _) _ =
  Left "Weights given; but cannot be used with substitution model."
getPhyloModel (Just _) Nothing _ (Just _) =
  Left "Empirical distribution mixture model components given; but cannot be used with substitution model."
getPhyloModel Nothing (Just m) mws mcs =
  Right $ P.MixtureModel $ parseStringWith "Mixture model string" (mixtureModel mcs mws) m