{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TemplateHaskell #-} {- | Module : Simulate.Simulate Description : Simulate multiple sequence alignments Copyright : (c) Dominik Schrempf 2019 License : GPL-3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : portable Creation date: Mon Jan 28 14:12:52 2019. -} module Simulate.Simulate ( simulateCmd ) where import Control.Concurrent import Control.Concurrent.Async import Control.Lens import Control.Monad.IO.Class import Control.Monad.Logger import Control.Monad.Trans.Class import Control.Monad.Trans.Reader import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Char8 as LC import qualified Data.Set as Set import qualified Data.Text as T import qualified Data.Text.Lazy as LT import qualified Data.Text.Lazy.Encoding as LT import Data.Tree import qualified Data.Vector.Unboxed as V import Numeric.LinearAlgebra hiding ((<>)) import System.Random.MWC import Simulate.Options import Simulate.PhyloModel import ELynx.Data.Alphabet.Alphabet as A import ELynx.Data.MarkovProcess.GammaRateHeterogeneity import qualified ELynx.Data.MarkovProcess.MixtureModel as M import qualified ELynx.Data.MarkovProcess.PhyloModel as P import qualified ELynx.Data.MarkovProcess.SubstitutionModel as S import ELynx.Data.Sequence.MultiSequenceAlignment import ELynx.Data.Sequence.Sequence hiding (name) import ELynx.Data.Tree.MeasurableTree import ELynx.Data.Tree.NamedTree import ELynx.Data.Tree.Tree import ELynx.Export.Sequence.Fasta import ELynx.Import.MarkovProcess.EDMModelPhylobayes hiding (Parser) import ELynx.Import.Tree.Newick hiding (name) import ELynx.Simulate.MarkovProcessAlongTree import ELynx.Tools.ByteString import ELynx.Tools.Concurrent import ELynx.Tools.InputOutput import ELynx.Tools.Misc -- Simulate a 'MultiSequenceAlignment' for a given phylogenetic model, -- phylogenetic tree, and alignment length. simulateMSA :: (Measurable a, Named a) => P.PhyloModel -> Tree a -> Int -> GenIO -> IO MultiSequenceAlignment simulateMSA pm t n g = do c <- getNumCapabilities gs <- splitGen c g let chunks = getChunks c n leafStatesS <- case pm of P.SubstitutionModel sm -> mapConcurrently (\(num, gen) -> simulateAndFlattenNSitesAlongTree num d e t gen) (zip chunks gs) where d = sm ^. S.stationaryDistribution e = sm ^. S.exchangeabilityMatrix P.MixtureModel mm -> mapConcurrently (\(num, gen) -> simulateAndFlattenNSitesAlongTreeMixtureModel num ws ds es t gen) (zip chunks gs) where ws = vector $ M.getWeights mm ds = map (view S.stationaryDistribution) $ M.getSubstitutionModels mm es = map (view S.exchangeabilityMatrix) $ M.getSubstitutionModels mm -- XXX: The horizontal concatenation might be slow. If so, 'concatenateSeqs' -- or 'concatenateMSAs' can be used, which directly appends vectors. let leafStates = horizontalConcat leafStatesS leafNames = map getName $ leaves t code = P.getAlphabet pm -- XXX: Probably use type safe stuff here? alph = A.all $ alphabetSpec code sequences = [ Sequence sName code (V.fromList $ map (`Set.elemAt` alph) ss) | (sName, ss) <- zip leafNames leafStates ] return $ either error id $ fromSequenceList sequences -- Summarize EDM components; line to be printed to screen or log. summarizeEDMComponents :: [EDMComponent] -> L.ByteString summarizeEDMComponents cs = LC.pack $ "Empiricial distribution mixture model with " ++ show (length cs) ++ " components." -- XXX. Maybe provide human readable model file. But then, why is this -- necessary. A human readable summary is reported anyways, and for Protein -- models the exchangeabilities are too many. reportModel :: Maybe FilePath -> P.PhyloModel -> Simulate () reportModel outFn m = do let modelFn = (<> ".model") <$> outFn io "model definition (machine readable)" (bsShow m <> "\n") modelFn -- | Simulate sequences. simulateCmd :: Maybe FilePath -> Simulate () simulateCmd outFn = do a <- lift ask $(logInfo) "Read tree." let treeFile = argsTreeFile a tree <- liftIO $ parseFileWith newick treeFile $(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ summarize tree let edmFile = argsEDMFile a edmCs <- case edmFile of Nothing -> return Nothing Just edmF -> do $(logInfo) "Read EDM file." liftIO $ Just <$> parseFileWith phylobayes edmF maybe (return ()) ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents) edmCs $(logInfo) "Read model string." let ms = argsSubstitutionModelString a mm = argsMixtureModelString a mws = argsMixtureWeights a eitherPhyloModel' = getPhyloModel ms mm mws edmCs phyloModel' <- case eitherPhyloModel' of Left err -> lift $ error err Right pm -> return pm let maybeGammaParams = argsGammaParams a phyloModel <- case maybeGammaParams of Nothing -> do $(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ LC.unlines $ P.summarize phyloModel' return phyloModel' Just (n, alpha) -> do $(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ LC.unlines $ P.summarize phyloModel' ++ summarizeGammaRateHeterogeneity n alpha return $ expand n alpha phyloModel' reportModel outFn phyloModel $(logInfo) "Simulate alignment." let alignmentLength = argsLength a $(logInfo) $ T.pack $ "Length: " <> show alignmentLength <> "." let maybeSeed = argsMaybeSeed a gen <- case maybeSeed of Nothing -> $(logInfo) "Seed: random" >> liftIO createSystemRandom Just s -> $(logInfo) (T.pack ("Seed: " <> show s <> ".")) >> liftIO (initialize (V.fromList s)) msa <- liftIO $ simulateMSA phyloModel tree alignmentLength gen let output = (sequencesToFasta . toSequenceList) msa outFile = (<> ".fasta") <$> outFn io "simulated multi sequence alignment" output outFile