{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TemplateHaskell #-} {- | Module : Analyze.Analyze Description : Parse sequence file formats and analyze them Copyright : (c) Dominik Schrempf 2018 License : GPL-3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : portable Creation date: Fri Oct 5 08:41:05 2018. -} module Examine.Examine ( examineCmd ) where import Control.Monad.Logger import Control.Monad.Trans.Class import Control.Monad.Trans.Reader import qualified Data.ByteString.Lazy.Char8 as L import qualified Data.Set as S import Text.Printf import Examine.Options import Tools import qualified ELynx.Data.Alphabet.Alphabet as A import qualified ELynx.Data.Alphabet.Character as C import qualified ELynx.Data.Sequence.Alignment as M import qualified ELynx.Data.Sequence.Sequence as Seq import ELynx.Tools.InputOutput examineAlignment :: Bool -> M.Alignment -> L.ByteString examineAlignment perSiteFlag a = L.unlines [ L.pack "Sequences have equal length (multi sequence alignment, or single sequence)." , L.pack $ "Total number of columns in alignment: " ++ show (M.length a) , L.pack $ "Number of columns without gaps: " ++ show (M.length aNoGaps) , L.pack $ "Number of columns with standard characters only: " ++ show (M.length aOnlyStd) , L.empty , L.pack $ "Total number of characters: " ++ show nTot , L.pack $ "Standard (i.e., not extended IUPAC) characters: " ++ show (nTot - nIUPAC - nGaps - nUnknowns) , L.pack $ "Extended IUPAC characters: " ++ show nIUPAC , L.pack $ "Gaps: " ++ show nGaps , L.pack $ "Unknowns: " ++ show nUnknowns , L.pack $ "Percentage of standard characters: " ++ printf "%.3f" (1.0 - percentageIUPAC - percentageGaps - percentageUnknowns) , L.pack $ "Percentage of extended IUPAC characters: " ++ printf "%.3f" percentageIUPAC , L.pack $ "Percentage of gaps: " ++ printf "%.3f" percentageGaps , L.pack $ "Percentage of unknowns: " ++ printf "%.3f" percentageUnknowns , L.empty , L.pack "Distribution of characters:" -- Holy crap. , L.pack $ concatMap ((: " ") . C.toChar) $ S.toList $ A.std $ A.alphabetSpec $ M.alphabet a , L.pack $ unwords $ map (printf "%.3f") charFreqs , L.empty , L.pack "Mean effective number of states (measured using entropy):" , L.pack "Across whole alignment: " <> L.pack (printf "%.3f" kEffMean) , L.pack "Across columns without gaps: " <> L.pack (printf "%.3f" kEffMeanNoGaps) , L.pack "Across columns without extended IUPAC characters: " <> L.pack (printf "%.3f" kEffMeanOnlyStd) ] <> perSiteBS where nTot = M.length a * M.nSequences a nIUPAC = M.countIUPACChars a nGaps = M.countGaps a nUnknowns = M.countUnknowns a percentageIUPAC = fromIntegral nIUPAC / fromIntegral nTot :: Double percentageGaps = fromIntegral nGaps / fromIntegral nTot :: Double percentageUnknowns = fromIntegral nUnknowns / fromIntegral nTot :: Double aNoGaps = M.filterColsNoGaps a aOnlyStd = M.filterColsOnlyStd aNoGaps charFreqsPerSite = M.toFrequencyData a charFreqs = M.distribution charFreqsPerSite kEffs = M.kEffEntropy charFreqsPerSite kEffsNoGaps = M.kEffEntropy . M.toFrequencyData $ aNoGaps kEffsOnlyStd = M.kEffEntropy . M.toFrequencyData $ aOnlyStd kEffMean = sum kEffs / fromIntegral (length kEffs) kEffMeanNoGaps = sum kEffsNoGaps / fromIntegral (length kEffsNoGaps) kEffMeanOnlyStd = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd) perSiteBS = if perSiteFlag then L.unlines [ L.pack "Effective number of used states per site:" , L.pack . show $ kEffs ] else L.empty examine :: Bool -> [Seq.Sequence] -> L.ByteString examine perSiteFlag ss = Seq.summarizeSequences ss <> case M.fromSequences ss of Left _ -> L.empty Right a -> L.pack "\n" <> examineAlignment perSiteFlag a -- | Examine sequences. examineCmd :: Maybe FilePath -> Examine () examineCmd outFileBaseName = do $(logInfo) "Command: Examine sequences." ExamineArguments al inFile perSiteFlag <- lift ask ss <- readSeqs al inFile let result = examine perSiteFlag ss let outFilePath = (++ ".out") <$> outFileBaseName out "result of examination" result outFilePath