{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Simulate.Simulate
( simulateCmd )
where
import Control.Applicative ((<|>))
import Control.Concurrent
import Control.Concurrent.Async
import Control.Monad (unless,
when)
import Control.Monad.IO.Class
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as LC
import Data.Maybe
import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Encoding as LT
import Data.Tree
import qualified Data.Vector.Unboxed as V
import Numeric.LinearAlgebra hiding ((<>))
import System.Random.MWC
import Simulate.Options
import Simulate.PhyloModel
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.MarkovProcess.GammaRateHeterogeneity
import qualified ELynx.Data.MarkovProcess.MixtureModel as M
import qualified ELynx.Data.MarkovProcess.PhyloModel as P
import qualified ELynx.Data.MarkovProcess.SubstitutionModel as SM
import qualified ELynx.Data.Sequence.Alignment as A
import qualified ELynx.Data.Sequence.Sequence as Seq hiding
(name)
import ELynx.Data.Tree.MeasurableTree
import ELynx.Data.Tree.NamedTree
import ELynx.Data.Tree.Tree
import ELynx.Export.Sequence.Fasta
import ELynx.Import.MarkovProcess.EDMModelPhylobayes hiding
(Parser)
import ELynx.Import.MarkovProcess.SiteprofilesPhylobayes hiding
(Parser)
import ELynx.Import.Tree.Newick hiding (name)
import ELynx.Simulate.MarkovProcessAlongTree
import ELynx.Tools.ByteString
import ELynx.Tools.Concurrent
import ELynx.Tools.InputOutput
import ELynx.Tools.Misc
simulateAlignment :: (Measurable a, Named a)
=> P.PhyloModel -> Tree a -> Int -> GenIO
-> IO A.Alignment
simulateAlignment pm t n g = do
c <- getNumCapabilities
gs <- splitGen c g
let chunks = getChunks c n
leafStatesS <- case pm of
P.SubstitutionModel sm -> mapConcurrently
(\(num, gen) -> simulateAndFlatten num d e t gen) (zip chunks gs)
where d = SM.stationaryDistribution sm
e = SM.exchangeabilityMatrix sm
P.MixtureModel mm -> simulateAndFlattenMixtureModelPar n ws ds es t g
where
ws = vector $ M.getWeights mm
ds = map SM.stationaryDistribution $ M.getSubstitutionModels mm
es = map SM.exchangeabilityMatrix $ M.getSubstitutionModels mm
let leafStates = horizontalConcat leafStatesS
leafNames = map getName $ leaves t
code = P.getAlphabet pm
alph = A.all $ alphabetSpec code
sequences = [ Seq.Sequence sName code (V.fromList $ map (`Set.elemAt` alph) ss) |
(sName, ss) <- zip leafNames leafStates ]
return $ either error id $ A.fromSequences sequences
summarizeEDMComponents :: [EDMComponent] -> L.ByteString
summarizeEDMComponents cs = LC.pack
$ "Empiricial distribution mixture model with "
++ show (length cs) ++ " components."
reportModel :: Maybe FilePath -> P.PhyloModel -> Simulate ()
reportModel Nothing _ = $(logInfo) "No output file provided; omit detailed report of phylogenetic model."
reportModel (Just outFn) m = do
let modelFn = outFn <> ".model"
out "model definition (machine readable)" (bShow m <> "\n") (Just modelFn)
simulateCmd :: Maybe FilePath -> Simulate ()
simulateCmd outFn = do
a <- lift ask
let treeFile = argsTreeFile a
$(logInfo) ""
$(logInfo) $ T.pack $ "Read tree from file '" ++ treeFile ++ "'."
tree <- liftIO $ parseFileWith newick treeFile
$(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ summarize tree
let edmFile = argsEDMFile a
let sProfileFiles = argsSiteprofilesFiles a
$(logInfo) ""
$(logDebug) "Read EDM file or siteprofile files."
when (isJust edmFile && isJust sProfileFiles) $ error "Got both: --edm-file and --siteprofile-files."
edmCs <- case edmFile of
Nothing -> return Nothing
Just edmF -> do
$(logInfo) "Read EDM file."
liftIO $ Just <$> parseFileWith phylobayes edmF
maybe (return ()) ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents) edmCs
sProfiles <- case sProfileFiles of
Nothing -> return Nothing
Just fns -> do
$(logInfo) $ T.pack $ "Read siteprofiles from " ++ show (length fns) ++ " file(s)."
$(logDebug) $ T.pack $ "The file names are:" ++ show fns
xs <- liftIO $ mapM (parseFileWith siteprofiles) fns
return $ Just $ concat xs
maybe (return ()) ($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents) sProfiles
let edmCsOrSiteprofiles = edmCs <|> sProfiles
$(logInfo) "Read model string."
let ms = argsSubstitutionModelString a
mm = argsMixtureModelString a
mws = argsMixtureWeights a
eitherPhyloModel' = getPhyloModel ms mm mws edmCsOrSiteprofiles
phyloModel' <- case eitherPhyloModel' of
Left err -> lift $ error err
Right pm -> return pm
let maybeGammaParams = argsGammaParams a
phyloModel <- case maybeGammaParams of
Nothing -> do
$(logInfo) $ LT.toStrict $ LT.decodeUtf8
$ LC.unlines $ P.summarize phyloModel'
return phyloModel'
Just (n, alpha) -> do
$(logInfo) $ LT.toStrict $ LT.decodeUtf8
$ LC.intercalate "\n" $ P.summarize phyloModel' ++ summarizeGammaRateHeterogeneity n alpha
return $ expand n alpha phyloModel'
unless (isJust sProfiles) (
do reportModel outFn phyloModel
$(logInfo) ""
)
$(logInfo) "Simulate alignment."
let alignmentLength = argsLength a
$(logInfo) $ T.pack $ "Length: " <> show alignmentLength <> "."
let maybeSeed = argsMaybeSeed a
gen <- case maybeSeed of
Nothing -> $(logInfo) "Seed: random"
>> liftIO createSystemRandom
Just s -> $(logInfo) (T.pack ("Seed: " <> show s <> "."))
>> liftIO (initialize (V.fromList s))
alignment <- liftIO $ simulateAlignment phyloModel tree alignmentLength gen
let output = (sequencesToFasta . A.toSequences) alignment
outFile = (<> ".fasta") <$> outFn
$(logInfo) ""
out "simulated multi sequence alignment" output outFile