{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module SLynx.Examine.Examine
( examineCmd,
)
where
import Control.Monad.Logger
import Control.Monad.Trans.Reader (ask)
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import qualified ELynx.Data.Alphabet.Alphabet as A
import qualified ELynx.Data.Alphabet.Character as C
import qualified ELynx.Data.Sequence.Alignment as M
import qualified ELynx.Data.Sequence.Sequence as Seq
import ELynx.Tools
import SLynx.Examine.Options
import SLynx.Tools
import Text.Printf
pRow :: String -> String -> BL.ByteString
pRow name val = alignLeft 50 n <> alignRight 10 v
where
n = BL.pack name
v = BL.pack val
examineAlignment :: Bool -> M.Alignment -> BL.ByteString
examineAlignment perSiteFlag a =
BL.unlines
[ BL.pack
"Sequences have equal length (multi sequence alignment, or single sequence).",
pRow "Total number of columns in alignment:" $ show (M.length a),
pRow "Number of columns without gaps:" $ show (M.length aNoGaps),
pRow "Number of columns with standard characters only:" $
show (M.length aOnlyStd),
BL.empty,
pRow "Total number of characters:" $ show nTot,
pRow "Standard (i.e., not extended IUPAC) characters:" $
show (nTot - nIUPAC - nGaps - nUnknowns),
pRow "Extended IUPAC characters:" $ show nIUPAC,
pRow "Gaps:" $ show nGaps,
pRow "Unknowns:" $ show nUnknowns,
BL.empty,
pRow "Percentage of standard characters:" $
printf "%2.2f" (100.0 - percentIUPAC - percentGaps - percentUnknowns),
pRow "Percentage of extended IUPAC characters:" $
printf "%2.2f" percentIUPAC,
pRow "Percentage of gaps:" $ printf "%2.2f" percentGaps,
pRow "Percentage of unknowns:" $ printf "%2.2f" percentUnknowns,
BL.empty,
BL.pack "Distribution of characters:",
BL.pack $
concatMap ((: " ") . C.toChar) $
S.toList $
A.std $
A.alphabetSpec $
M.alphabet a,
BL.pack $ unwords $ map (printf "%.3f") charFreqs,
BL.empty,
BL.pack "Mean effective number of states (measured using entropy):",
pRow "Across whole alignment:" $ printf "%.3f" kEffMean,
pRow "Across columns without gaps:" $ printf "%.3f" kEffMeanNoGaps,
pRow "Across columns without extended IUPAC characters:" $
printf "%.3f" kEffMeanOnlyStd,
BL.empty,
BL.pack "Mean effective number of states (measured using homoplasy):",
pRow "Across whole alignment:" $ printf "%.3f" kEffMeanHomo,
pRow "Across columns without gaps:" $ printf "%.3f" kEffMeanNoGapsHomo,
pRow "Across columns without extended IUPAC characters:" $
printf "%.3f" kEffMeanOnlyStdHomo
]
<> perSiteBS
where
nTot = M.length a * M.nSequences a
nIUPAC = M.countIUPACChars a
nGaps = M.countGaps a
nUnknowns = M.countUnknowns a
percentIUPAC = 100 * fromIntegral nIUPAC / fromIntegral nTot :: Double
percentGaps = 100 * fromIntegral nGaps / fromIntegral nTot :: Double
percentUnknowns = 100 * fromIntegral nUnknowns / fromIntegral nTot :: Double
aNoGaps = M.filterColsNoGaps a
aOnlyStd = M.filterColsOnlyStd aNoGaps
charFreqsPerSite = M.toFrequencyData a
charFreqs = M.distribution charFreqsPerSite
kEffs = M.kEffEntropy charFreqsPerSite
kEffsNoGaps = M.kEffEntropy . M.toFrequencyData $ aNoGaps
kEffsOnlyStd = M.kEffEntropy . M.toFrequencyData $ aOnlyStd
kEffMean = sum kEffs / fromIntegral (length kEffs)
kEffMeanNoGaps = sum kEffsNoGaps / fromIntegral (length kEffsNoGaps)
kEffMeanOnlyStd = sum kEffsOnlyStd / fromIntegral (length kEffsOnlyStd)
kEffsHomo = M.kEffHomoplasy charFreqsPerSite
kEffsNoGapsHomo = M.kEffHomoplasy . M.toFrequencyData $ aNoGaps
kEffsOnlyStdHomo = M.kEffHomoplasy . M.toFrequencyData $ aOnlyStd
kEffMeanHomo = sum kEffsHomo / fromIntegral (length kEffsHomo)
kEffMeanNoGapsHomo =
sum kEffsNoGapsHomo / fromIntegral (length kEffsNoGapsHomo)
kEffMeanOnlyStdHomo =
sum kEffsOnlyStdHomo / fromIntegral (length kEffsOnlyStdHomo)
perSiteBS =
if perSiteFlag
then
BL.unlines
[ BL.pack "Effective number of used states per site:",
BL.pack . show $ kEffs
]
else BL.empty
examine :: Bool -> [Seq.Sequence] -> BL.ByteString
examine perSiteFlag ss =
Seq.summarizeSequences ss <> case M.fromSequences ss of
Left _ -> BL.empty
Right a -> BL.pack "\n" <> examineAlignment perSiteFlag a
examineCmd :: ELynx ExamineArguments ()
examineCmd = do
(ExamineArguments al inFile perSiteFlag) <- local <$> ask
$(logInfo) "Command: Examine sequences."
ss <- readSeqs al inFile
let result = examine perSiteFlag ss
out "result of examination" result ".out"