{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Filter.Filter
( filterRowsCmd
, filterColsCmd
)
where
import Control.Monad (when)
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8 as L
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Filter.Options
import Tools
import qualified ELynx.Data.Sequence.Alignment as M
import qualified ELynx.Data.Sequence.Sequence as S
import ELynx.Export.Sequence.Fasta
import ELynx.Tools.InputOutput
import ELynx.Tools.Misc
-- | Run the selected row filters over a list of sequences and render the
-- result as FASTA.
--
-- The first argument, when present, is handed to 'S.filterLongerThan' and the
-- second to 'S.filterShorterThan'; an absent bound contributes 'id' instead.
-- When the flag is 'True', 'S.filterStandard' is put at the head of the filter
-- list. The list is combined with 'compose', which determines the order in
-- which the filters are actually applied.
filterRows :: Maybe Int -> Maybe Int -> Bool -> [S.Sequence] -> L.ByteString
filterRows ml ms std = sequencesToFasta . compose selectedFilters
  where
    -- One entry per optional length bound; 'Nothing' becomes a no-op.
    lengthFilters =
      [ maybe id S.filterLongerThan ml
      , maybe id S.filterShorterThan ms
      ]
    selectedFilters
      | std = S.filterStandard : lengthFilters
      | otherwise = lengthFilters
-- | Entry point of the row-filtering command.
--
-- Reads the 'FilterRowsArguments' from the reader environment, logs which
-- filters are active, reads the sequences with 'readSeqs', filters them with
-- 'filterRows' and passes the FASTA result to 'out'.
--
-- The argument is an optional output base name; @".fasta"@ is appended to it
-- before it is given to 'out'. What 'out' does for 'Nothing' is not visible
-- here (presumably it writes to standard output -- confirm).
filterRowsCmd :: Maybe FilePath -> FilterRows ()
filterRowsCmd outFileBaseName = do
  $(logInfo) "Command: Filter sequences of a list of sequences."
  FilterRowsArguments al inFile long short std <- lift ask
  -- Only mention the filters that were actually requested.
  case long of
    Nothing -> return ()
    Just minLen ->
      $(logInfo) $ T.pack (" Keep sequences longer than " ++ show minLen ++ ".")
  case short of
    Nothing -> return ()
    Just maxLen ->
      $(logInfo) $ T.pack (" Keep sequences shorter than " ++ show maxLen ++ ".")
  when std $
    $(logInfo) " Keep sequences containing at least one standard (i.e., non-IUPAC) character."
  ss <- readSeqs al inFile
  let fastaPath = fmap (++ ".fasta") outFileBaseName
      filtered = filterRows long short std ss
  out "filtered sequences" filtered fastaPath
-- | Build an alignment from the sequences, apply the selected column filters
-- and render the remaining alignment as FASTA.
--
-- The optional 'Double' is handed to 'M.filterColsStd'; when it is 'Nothing'
-- the alignment is left unfiltered.
--
-- Calls 'error' with the message returned by 'M.fromSequences' if the
-- sequences cannot be turned into an alignment.
filterCols :: Maybe Double -> [S.Sequence] -> L.ByteString
filterCols ms ss = sequencesToFasta (M.toSequences filtered)
  where
    -- Conversion failure is fatal: the 'Left' message is raised via 'error'.
    alignment = case M.fromSequences ss of
      Left msg -> error msg
      Right msa -> msa
    columnFilters = [maybe id M.filterColsStd ms]
    filtered = compose columnFilters alignment
-- | Entry point of the column-filtering command.
--
-- Reads the 'FilterColsArguments' from the reader environment, logs the
-- requested threshold (if any), reads the sequences with 'readSeqs', filters
-- the columns with 'filterCols' and passes the FASTA result to 'out'.
--
-- The argument is an optional output base name; @".fasta"@ is appended to it
-- before it is given to 'out'. What 'out' does for 'Nothing' is not visible
-- here (presumably it writes to standard output -- confirm).
filterColsCmd :: Maybe FilePath -> FilterCols ()
filterColsCmd outFileBaseName = do
  $(logInfo) "Command: Filter columns of a multi sequence alignment."
  FilterColsArguments al inFile standard <- lift ask
  -- Log the threshold only when one was supplied.
  maybe
    (return ())
    ( \threshold ->
        $(logInfo) $
          T.pack $
            " Keep columns with a proportion of standard (non-IUPAC) characters larger than "
              <> show threshold
              <> "."
    )
    standard
  ss <- readSeqs al inFile
  let fastaPath = fmap (++ ".fasta") outFileBaseName
      filtered = filterCols standard ss
  out "filtered sequences" filtered fastaPath