{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Examine.Examine
( examineCmd
)
where
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.Set as S
import Text.Printf
import Examine.Options
import Tools
import qualified ELynx.Data.Alphabet.Alphabet as A
import qualified ELynx.Data.Alphabet.Character as C
import qualified ELynx.Data.Sequence.Alignment as M
import qualified ELynx.Data.Sequence.Sequence as Seq
import ELynx.Tools.InputOutput
examineAlignment :: Bool -> M.Alignment -> L.ByteString
examineAlignment perSiteFlag a =
L.unlines [ L.pack "Sequences have equal length (multi sequence alignment, or single sequence)."
, L.pack $ "Total number of columns in alignment: "
++ show (M.length a)
, L.pack $ "Number of columns without gaps: "
++ show (M.length aNoGaps)
, L.pack $ "Number of columns with standard characters only: "
++ show (M.length aOnlyStd)
, L.empty
, L.pack $ "Total number of characters: " ++ show nTot
, L.pack $ "Standard (i.e., not extended IUPAC) characters: "
++ show (nTot - nIUPAC - nGaps - nUnknowns)
, L.pack $ "Extended IUPAC characters: " ++ show nIUPAC
, L.pack $ "Gaps: " ++ show nGaps
, L.pack $ "Unknowns: " ++ show nUnknowns
, L.pack $ "Percentage of standard characters: "
++ printf "%.3f" (1.0 - percentageIUPAC - percentageGaps - percentageUnknowns)
, L.pack $ "Percentage of extended IUPAC characters: "
++ printf "%.3f" percentageIUPAC
, L.pack $ "Percentage of gaps: "
++ printf "%.3f" percentageGaps
, L.pack $ "Percentage of unknowns: "
++ printf "%.3f" percentageUnknowns
, L.empty
, L.pack "Distribution of characters:"
, L.pack $ concatMap ((: " ") . C.toChar) $ S.toList $
A.std $ A.alphabetSpec $ M.alphabet a
, L.pack $ unwords $ map (printf "%.3f") charFreqs
, L.empty
, L.pack "Mean effective number of states (measured using entropy):"
, L.pack "Across whole alignment: "
<> L.pack (printf "%.3f" kEffMean)
, L.pack "Across columns without gaps: "
<> L.pack (printf "%.3f" kEffMeanNoGaps)
, L.pack "Across columns without extended IUPAC characters: "
<> L.pack (printf "%.3f" kEffMeanOnlyStd)
]
<> perSiteBS
where
nTot = M.length a * M.nSequences a
nIUPAC = M.countIUPACChars a
nGaps = M.countGaps a
nUnknowns = M.countUnknowns a
percentageIUPAC = fromIntegral nIUPAC / fromIntegral nTot :: Double
percentageGaps = fromIntegral nGaps / fromIntegral nTot :: Double
percentageUnknowns = fromIntegral nUnknowns / fromIntegral nTot :: Double
aNoGaps = M.filterColsNoGaps a
aOnlyStd = M.filterColsOnlyStd aNoGaps
charFreqsPerSite = M.toFrequencyData a
charFreqs = M.distribution charFreqsPerSite
kEffs = M.kEffEntropy charFreqsPerSite
kEffsNoGaps = M.kEffEntropy . M.toFrequencyData $ aNoGaps
kEffsOnlyStd = M.kEffEntropy . M.toFrequencyData $ aOnlyStd
kEffMean = sum kEffs / fromIntegral (length kEffs)
kEffMeanNoGaps = sum kEffsNoGaps / fromIntegral (length kEffsNoGaps)
kEffMeanOnlyStd = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd)
perSiteBS = if perSiteFlag
then L.unlines [ L.pack "Effective number of used states per site:"
, L.pack . show $ kEffs
]
else L.empty
examine :: Bool -> [Seq.Sequence] -> L.ByteString
examine perSiteFlag ss = Seq.summarizeSequences ss <>
case M.fromSequences ss of
Left _ -> L.empty
Right a -> L.pack "\n" <> examineAlignment perSiteFlag a
examineCmd :: Maybe FilePath -> Examine ()
examineCmd outFileBaseName = do
$(logInfo) "Command: Examine sequences."
ExamineArguments al inFile perSiteFlag <- lift ask
ss <- readSeqs al inFile
let result = examine perSiteFlag ss
let outFilePath = (++ ".out") <$> outFileBaseName
out "result of examination" result outFilePath