{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell   #-}

{- |
Module      :  Simulate.Simulate
Description :  Simulate multiple sequence alignments
Copyright   :  (c) Dominik Schrempf 2019
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Mon Jan 28 14:12:52 2019.

-}

module Simulate.Simulate
  ( simulateCmd )
where

import           Control.Applicative                               ((<|>))
import           Control.Concurrent
import           Control.Concurrent.Async
import           Control.Monad                                     (unless,
                                                                    when)
import           Control.Monad.IO.Class
import           Control.Monad.Logger
import           Control.Monad.Trans.Class
import           Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy                              as L
import qualified Data.ByteString.Lazy.Char8                        as LC
import           Data.Maybe
import qualified Data.Set                                          as Set
import qualified Data.Text                                         as T
import qualified Data.Text.Lazy                                    as LT
import qualified Data.Text.Lazy.Encoding                           as LT
import           Data.Tree
import qualified Data.Vector.Unboxed                               as V
import           Numeric.LinearAlgebra                             hiding ((<>))
import           System.Random.MWC

import           Simulate.Options
import           Simulate.PhyloModel

import           ELynx.Data.Alphabet.Alphabet                      as A
import           ELynx.Data.MarkovProcess.GammaRateHeterogeneity
import qualified ELynx.Data.MarkovProcess.MixtureModel             as M
import qualified ELynx.Data.MarkovProcess.PhyloModel               as P
import qualified ELynx.Data.MarkovProcess.SubstitutionModel        as SM
import qualified ELynx.Data.Sequence.Alignment                     as A
import qualified ELynx.Data.Sequence.Sequence                      as Seq hiding
                                                                           (name)
import           ELynx.Data.Tree.MeasurableTree
import           ELynx.Data.Tree.NamedTree
import           ELynx.Data.Tree.Tree
import           ELynx.Export.Sequence.Fasta
import           ELynx.Import.MarkovProcess.EDMModelPhylobayes     hiding
                                                                    (Parser)
import           ELynx.Import.MarkovProcess.SiteprofilesPhylobayes hiding
                                                                    (Parser)
import           ELynx.Import.Tree.Newick                          hiding (name)
import           ELynx.Simulate.MarkovProcessAlongTree

import           ELynx.Tools.ByteString
import           ELynx.Tools.Concurrent
import           ELynx.Tools.InputOutput
import           ELynx.Tools.Misc

-- Simulate a 'Alignment' for a given phylogenetic model,
-- phylogenetic tree, and alignment length.
simulateAlignment :: (Measurable a, Named a)
            => P.PhyloModel -> Tree a -> Int -> GenIO
            -> IO A.Alignment
simulateAlignment pm t n g = do
  c  <- getNumCapabilities
  gs <- splitGen c g
  let chunks = getChunks c n
  leafStatesS <- case pm of
    -- TODO: This parallelization is not very intelligent, because the matrices
    -- exponentiation is done in all threads. So ten threads will exponentiate
    -- the same matrix ten times.
    P.SubstitutionModel sm -> mapConcurrently
      (\(num, gen) -> simulateAndFlatten num d e t gen) (zip chunks gs)
      where d = SM.stationaryDistribution sm
            e = SM.exchangeabilityMatrix sm
    -- P.MixtureModel mm      -> mapConcurrently
    --   (\(num, gen) -> simulateAndFlattenNSitesAlongTreeMixtureModel num ws ds es t gen) (zip chunks gs)
    P.MixtureModel mm      -> simulateAndFlattenMixtureModelPar n ws ds es t g
      where
        ws = vector $ M.getWeights mm
        ds = map SM.stationaryDistribution $ M.getSubstitutionModels mm
        es = map SM.exchangeabilityMatrix  $ M.getSubstitutionModels mm
  -- XXX: The horizontal concatenation might be slow. If so, 'concatenateSeqs'
  -- or 'concatenateAlignments' can be used, which directly appends vectors.
  let leafStates = horizontalConcat leafStatesS
      leafNames  = map getName $ leaves t
      code       = P.getAlphabet pm
      -- XXX: Probably use type safe stuff here?
      alph       = A.all $ alphabetSpec code
      sequences  = [ Seq.Sequence sName code (V.fromList $ map (`Set.elemAt` alph) ss) |
                    (sName, ss) <- zip leafNames leafStates ]
  return $ either error id $ A.fromSequences sequences

-- Summarize EDM components; line to be printed to screen or log.
summarizeEDMComponents :: [EDMComponent] -> L.ByteString
summarizeEDMComponents cs = LC.pack
                            $ "Empiricial distribution mixture model with "
                            ++ show (length cs) ++ " components."

-- XXX. Maybe provide human readable model file. But then, why is this
-- necessary. A human readable summary is reported anyways, and for Protein
-- models the exchangeabilities are too many.
reportModel :: Maybe FilePath -> P.PhyloModel -> Simulate ()
reportModel Nothing      _ = $(logInfo) "No output file provided; omit detailed report of phylogenetic model."
reportModel (Just outFn) m = do
  let modelFn = outFn <> ".model"
  out "model definition (machine readable)" (bShow m <> "\n") (Just modelFn)

-- | Simulate sequences.
simulateCmd :: Maybe FilePath -> Simulate ()
simulateCmd outFn = do
  a <- lift ask
  let treeFile = argsTreeFile a

  $(logInfo) ""
  $(logInfo) $ T.pack $ "Read tree from file '" ++ treeFile ++ "'."
  tree <- liftIO $ parseFileWith newick treeFile
  $(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ summarize tree

  let edmFile = argsEDMFile a
  let sProfileFiles = argsSiteprofilesFiles a
  $(logInfo) ""
  $(logDebug) "Read EDM file or siteprofile files."
  when (isJust edmFile && isJust sProfileFiles) $ error "Got both: --edm-file and --siteprofile-files."
  edmCs <- case edmFile of
    Nothing   -> return Nothing
    Just edmF -> do
      $(logInfo) "Read EDM file."
      liftIO $ Just <$> parseFileWith phylobayes edmF
  maybe (return ()) ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents) edmCs
  sProfiles <- case sProfileFiles of
    Nothing   -> return Nothing
    Just fns -> do
      $(logInfo) $ T.pack $ "Read siteprofiles from " ++ show (length fns) ++ " file(s)."
      $(logDebug) $ T.pack $ "The file names are:" ++ show fns
      xs <- liftIO $ mapM (parseFileWith siteprofiles) fns
      return $ Just $ concat xs
  maybe (return ()) ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents) sProfiles
  let edmCsOrSiteprofiles = edmCs <|> sProfiles

  $(logInfo) "Read model string."
  let ms = argsSubstitutionModelString a
      mm = argsMixtureModelString a
      mws = argsMixtureWeights a
      eitherPhyloModel' = getPhyloModel ms mm mws edmCsOrSiteprofiles
  phyloModel' <- case eitherPhyloModel' of
    Left err -> lift $ error err
    Right pm -> return pm

  let maybeGammaParams = argsGammaParams a
  phyloModel <- case maybeGammaParams of
    Nothing         -> do
      $(logInfo) $ LT.toStrict $ LT.decodeUtf8
        $ LC.unlines $ P.summarize phyloModel'
      return phyloModel'
    Just (n, alpha) -> do
      $(logInfo) $ LT.toStrict $ LT.decodeUtf8
        $ LC.intercalate "\n" $ P.summarize phyloModel' ++ summarizeGammaRateHeterogeneity n alpha
      return $ expand n alpha phyloModel'

  -- -- XXX: Do not report possibly huge empirical distribution mixture models
  -- -- for now, because it takes too long and uses too much disk space :).
  unless (isJust sProfiles) (
    do reportModel outFn phyloModel
       $(logInfo) ""
    )

  $(logInfo) "Simulate alignment."
  let alignmentLength = argsLength a
  $(logInfo) $ T.pack $ "Length: " <> show alignmentLength <> "."
  let maybeSeed = argsMaybeSeed a
  gen <- case maybeSeed of
    Nothing -> $(logInfo) "Seed: random"
               >> liftIO createSystemRandom
    Just s  -> $(logInfo) (T.pack ("Seed: " <> show s <> "."))
               >> liftIO (initialize (V.fromList s))
  alignment <- liftIO $ simulateAlignment phyloModel tree alignmentLength gen
  let output = (sequencesToFasta . A.toSequences) alignment
      outFile = (<> ".fasta") <$> outFn
  $(logInfo) ""
  out "simulated multi sequence alignment" output outFile