{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module SubSample.SubSample
( subSampleCmd
)
where
import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Builder as LT
import qualified Data.Text.Lazy.Builder.Int as LT
import qualified Data.Vector as V
import System.Random.MWC
import SubSample.Options
import Tools
import ELynx.Data.Sequence.MultiSequenceAlignment
import ELynx.Export.Sequence.Fasta
import ELynx.Tools.InputOutput
-- | Build the numbered output file paths @file.i.suffix@ for every index @i@
-- in @[0 .. n-1]@.
--
-- Each index is left-padded with @\'0\'@ to the width of the largest index
-- (@n - 1@), so the resulting names have a common length. The result is empty
-- when @n <= 0@.
getOutFilePaths :: String -> Int -> String -> [FilePath]
getOutFilePaths file n suffix =
  [ file ++ "." ++ digitStr i ++ "." ++ suffix | i <- [0 .. n - 1] ]
  where
    -- Width of the largest index, counted on its decimal representation.
    -- A floating-point @logBase 10@ is avoided on purpose: its rounding error
    -- at (or near) powers of ten can shift the ceiling by one, and it is not
    -- finite for @n = 0@.
    nDigits = length (show (max 0 (n - 1)))
    -- Render one index as a zero-padded decimal string.
    digitStr i =
      T.unpack
        $ T.justifyRight nDigits '0'
        $ LT.toStrict
        $ LT.toLazyText
        $ LT.decimal i
-- | Run the sub-sample command.
--
-- The command arguments are read from the reader environment. The sequences
-- are read from the input file and turned into a multi sequence alignment
-- (calling 'error' if that conversion fails). The requested number of
-- sub-samples is then drawn, each with the requested number of sites, and
-- every sub-sample is rendered as FASTA and handed to @io@.
--
-- When an output base name is given, the @i@-th sample is paired with the
-- @i@-th path produced by @getOutFilePaths@ (suffix @fasta@); otherwise every
-- sample is paired with 'Nothing'.
-- NOTE(review): presumably @io@ writes to standard output for 'Nothing' --
-- confirm against its definition.
subSampleCmd
  :: Maybe FilePath
  -> SubSample ()
subSampleCmd outFileBaseName = do
  SubSampleArguments alph inputPath siteCount sampleCount maybeSeed <- lift ask
  $(logInfo) "Command: Sub sample from a multi sequence alignment."
  $(logInfo) (T.pack (" Sample " ++ show siteCount ++ " sites."))
  $(logInfo) (T.pack (" Sample " ++ show sampleCount ++ " multi sequence alignments."))
  seqs <- readSeqs alph inputPath
  -- Without a seed, fall back to the system's source of randomness.
  gen <- liftIO $ case maybeSeed of
    Nothing    -> createSystemRandom
    Just words -> initialize (V.fromList words)
  let alignment = either error id (fromSequenceList seqs)
  subSamples <- lift (replicateM sampleCount (randomSubSample siteCount alignment gen))
  let fastas  = [ sequencesToFasta (toSequenceList s) | s <- subSamples ]
      targets = maybe
        (repeat Nothing)
        (\base -> map Just (getOutFilePaths base sampleCount "fasta"))
        outFileBaseName
  -- 'zip' truncates to the shorter list, so the infinite 'repeat Nothing'
  -- is cut down to one entry per sample.
  forM_ (zip fastas targets) (uncurry (io "sub sampled multi sequence alignments"))