{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}

-- |
--
-- Module      :  Analyze.Analyze
-- Description :  Parse sequence file formats and analyze them
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Fri Oct  5 08:41:05 2018.
module SLynx.Examine.Examine
  ( examineCmd,
  )
where

import Control.Monad.Trans.Reader
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import qualified ELynx.Data.Alphabet.Character as C
import qualified ELynx.Data.Sequence.Alignment as M
import qualified ELynx.Data.Sequence.Distance as D
import qualified ELynx.Data.Sequence.Sequence as Seq
import ELynx.Tools
import SLynx.Examine.Options
import SLynx.Tools
import qualified Statistics.Sample as Sm
import Text.Printf

pRow :: String -> String -> BL.ByteString
pRow :: String -> String -> ByteString
pRow String
name String
val = Int -> ByteString -> ByteString
alignLeft Int
50 ByteString
n ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Int -> ByteString -> ByteString
alignRight Int
10 ByteString
v
  where
    n :: ByteString
n = String -> ByteString
BL.pack String
name
    v :: ByteString
v = String -> ByteString
BL.pack String
val

examineAlignment :: Bool -> M.Alignment -> BL.ByteString
examineAlignment :: Bool -> Alignment -> ByteString
examineAlignment Bool
perSiteFlag Alignment
a =
  [ByteString] -> ByteString
BL.unlines
    [ String -> ByteString
BL.pack
        String
"Sequences have equal length (multi sequence alignment, or single sequence).",
      String -> ByteString
BL.pack String
"Number of columns in alignment:",
      String -> String -> ByteString
pRow String
"  Total:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
aL,
      String -> String -> ByteString
pRow String
"  Constant:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nConstant,
      String -> String -> ByteString
pRow String
"  Constant (including gaps or unknowns):" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nConstantSoft,
      String -> String -> ByteString
pRow String
"  Without gaps:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show (Alignment -> Int
M.length Alignment
aNoGaps),
      String -> String -> ByteString
pRow String
"  With standard characters only:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        Int -> String
forall a. Show a => a -> String
show (Alignment -> Int
M.length Alignment
aOnlyStd),
      ByteString
BL.empty,
      String -> String -> ByteString
pRow String
"Total number of characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nTot,
      String -> String -> ByteString
pRow String
"Standard (i.e., not extended IUPAC) characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        Int -> String
forall a. Show a => a -> String
show (Int
nTot Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nIUPAC Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nGaps Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nUnknowns),
      String -> String -> ByteString
pRow String
"Extended IUPAC characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nIUPAC,
      String -> String -> ByteString
pRow String
"Gaps:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nGaps,
      String -> String -> ByteString
pRow String
"Unknowns:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ Int -> String
forall a. Show a => a -> String
show Int
nUnknowns,
      ByteString
BL.empty,
      String -> String -> ByteString
pRow String
"Percentage of standard characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%2.2f" (Double
100.0 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
percentIUPAC Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
percentGaps Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
percentUnknowns),
      String -> String -> ByteString
pRow String
"Percentage of extended IUPAC characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentIUPAC,
      String -> String -> ByteString
pRow String
"Percentage of gaps:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentGaps,
      String -> String -> ByteString
pRow String
"Percentage of unknowns:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%2.2f" Double
percentUnknowns,
      ByteString
BL.empty,
      String -> ByteString
BL.pack String
"Distribution of characters:",
      -- Holy crap.
      String -> ByteString
BL.pack (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        (Character -> String) -> [Character] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((Char -> String -> String
forall a. a -> [a] -> [a]
: String
"     ") (Char -> String) -> (Character -> Char) -> Character -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Character -> Char
C.toChar) ([Character] -> String) -> [Character] -> String
forall a b. (a -> b) -> a -> b
$
          Set Character -> [Character]
forall a. Set a -> [a]
S.toList (Set Character -> [Character]) -> Set Character -> [Character]
forall a b. (a -> b) -> a -> b
$
            AlphabetSpec -> Set Character
A.std (AlphabetSpec -> Set Character) -> AlphabetSpec -> Set Character
forall a b. (a -> b) -> a -> b
$
              Alphabet -> AlphabetSpec
A.alphabetSpec (Alphabet -> AlphabetSpec) -> Alphabet -> AlphabetSpec
forall a b. (a -> b) -> a -> b
$
                Alignment -> Alphabet
M.alphabet Alignment
a,
      String -> ByteString
BL.pack (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ [String] -> String
unwords ([String] -> String) -> [String] -> String
forall a b. (a -> b) -> a -> b
$ (Double -> String) -> [Double] -> [String]
forall a b. (a -> b) -> [a] -> [b]
map (String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f") [Double]
charFreqs,
      ByteString
BL.empty,
      String -> ByteString
BL.pack String
"Pairwise hamming distances (per site):",
      String -> String -> ByteString
pRow String
"  Mean:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMean,
      String -> String -> ByteString
pRow String
"  Standard deviation:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" (Double -> String) -> Double -> String
forall a b. (a -> b) -> a -> b
$ Double -> Double
forall a. Floating a => a -> a
sqrt Double
hVar,
      String -> String -> ByteString
pRow String
"  Minimum:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMin,
      String -> String -> ByteString
pRow String
"  Maximum:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
hMax,
      ByteString
BL.empty,
      String -> ByteString
BL.pack String
"Mean effective number of states (measured using entropy):",
      String -> String -> ByteString
pRow String
"Across whole alignment:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMean,
      String -> String -> ByteString
pRow String
"Across columns without gaps:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanNoGaps,
      String -> String -> ByteString
pRow String
"Across columns without extended IUPAC characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanOnlyStd,
      ByteString
BL.empty,
      String -> ByteString
BL.pack String
"Mean effective number of states (measured using homoplasy):",
      String -> String -> ByteString
pRow String
"Across whole alignment:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanHomo,
      String -> String -> ByteString
pRow String
"Across columns without gaps:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$ String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanNoGapsHomo,
      String -> String -> ByteString
pRow String
"Across columns without extended IUPAC characters:" (String -> ByteString) -> String -> ByteString
forall a b. (a -> b) -> a -> b
$
        String -> Double -> String
forall r. PrintfType r => String -> r
printf String
"%.3f" Double
kEffMeanOnlyStdHomo
    ]
    ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString
perSiteBS
  where
    aL :: Int
aL = Alignment -> Int
M.length Alignment
a
    nConstant :: Int
nConstant = Alignment -> Int
M.length (Alignment -> Int) -> Alignment -> Int
forall a b. (a -> b) -> a -> b
$ Alignment -> Alignment
M.filterColsConstant Alignment
a
    nConstantSoft :: Int
nConstantSoft = Alignment -> Int
M.length (Alignment -> Int) -> Alignment -> Int
forall a b. (a -> b) -> a -> b
$ Alignment -> Alignment
M.filterColsConstantSoft Alignment
a
    nTot :: Int
nTot = Alignment -> Int
M.length Alignment
a Int -> Int -> Int
forall a. Num a => a -> a -> a
* Alignment -> Int
M.nSequences Alignment
a
    nIUPAC :: Int
nIUPAC = Alignment -> Int
M.countIUPACChars Alignment
a
    nGaps :: Int
nGaps = Alignment -> Int
M.countGaps Alignment
a
    nUnknowns :: Int
nUnknowns = Alignment -> Int
M.countUnknowns Alignment
a
    percentIUPAC :: Double
percentIUPAC = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nIUPAC Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    percentGaps :: Double
percentGaps = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    percentUnknowns :: Double
percentUnknowns = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nUnknowns Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nTot :: Double
    aNoGaps :: Alignment
aNoGaps = Alignment -> Alignment
M.filterColsNoGaps Alignment
a
    aNoGapsFreq :: FrequencyData
aNoGapsFreq = Alignment -> FrequencyData
M.toFrequencyData Alignment
aNoGaps
    aOnlyStd :: Alignment
aOnlyStd = Alignment -> Alignment
M.filterColsOnlyStd Alignment
aNoGaps
    aOnlyStdFreq :: FrequencyData
aOnlyStdFreq = Alignment -> FrequencyData
M.toFrequencyData Alignment
aOnlyStd
    charFreqsPerSite :: FrequencyData
charFreqsPerSite = Alignment -> FrequencyData
M.toFrequencyData Alignment
a
    charFreqs :: [Double]
charFreqs = FrequencyData -> [Double]
M.distribution FrequencyData
charFreqsPerSite
    seqs :: [Sequence]
seqs = Alignment -> [Sequence]
M.toSequences Alignment
a
    normlz :: a -> a
normlz a
x = a -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x a -> a -> a
forall a. Fractional a => a -> a -> a
/ Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
aL
    pairwiseHamming :: Vector Double
pairwiseHamming =
      [Double] -> Vector Double
forall a. Unbox a => [a] -> Vector a
V.fromList
        [ (String -> Double)
-> (Int -> Double) -> Either String Int -> Double
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either String -> Double
forall a. HasCallStack => String -> a
error Int -> Double
forall a a. (Fractional a, Integral a) => a -> a
normlz (Either String Int -> Double) -> Either String Int -> Double
forall a b. (a -> b) -> a -> b
$ Sequence -> Sequence -> Either String Int
D.hamming Sequence
x Sequence
y
          | Sequence
x <- [Sequence]
seqs,
            Sequence
y <- [Sequence]
seqs,
            Sequence
x Sequence -> Sequence -> Bool
forall a. Eq a => a -> a -> Bool
/= Sequence
y
        ]
    (Double
hMean, Double
hVar) = Vector Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
Sm.meanVariance Vector Double
pairwiseHamming
    hMin :: Double
hMin = Vector Double -> Double
forall a. (Unbox a, Ord a) => Vector a -> a
V.minimum Vector Double
pairwiseHamming
    hMax :: Double
hMax = Vector Double -> Double
forall a. (Unbox a, Ord a) => Vector a -> a
V.maximum Vector Double
pairwiseHamming
    kEffs :: [Double]
kEffs = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
charFreqsPerSite
    kEffsNoGaps :: [Double]
kEffsNoGaps = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
aNoGapsFreq
    kEffsOnlyStd :: [Double]
kEffsOnlyStd = FrequencyData -> [Double]
M.kEffEntropy FrequencyData
aOnlyStdFreq
    kEffMean :: Double
kEffMean = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffs Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffs)
    kEffMeanNoGaps :: Double
kEffMeanNoGaps = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsNoGaps Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsNoGaps)
    kEffMeanOnlyStd :: Double
kEffMeanOnlyStd = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsOnlyStd Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsOnlyStd)
    kEffsHomo :: [Double]
kEffsHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
charFreqsPerSite
    kEffsNoGapsHomo :: [Double]
kEffsNoGapsHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
aNoGapsFreq
    kEffsOnlyStdHomo :: [Double]
kEffsOnlyStdHomo = FrequencyData -> [Double]
M.kEffHomoplasy FrequencyData
aOnlyStdFreq
    kEffMeanHomo :: Double
kEffMeanHomo = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsHomo Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsHomo)
    kEffMeanNoGapsHomo :: Double
kEffMeanNoGapsHomo =
      [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsNoGapsHomo Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsNoGapsHomo)
    kEffMeanOnlyStdHomo :: Double
kEffMeanOnlyStdHomo =
      [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
kEffsOnlyStdHomo Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
kEffsOnlyStdHomo)
    perSiteBS :: ByteString
perSiteBS =
      if Bool
perSiteFlag
        then
          [ByteString] -> ByteString
BL.unlines
            [ String -> ByteString
BL.pack String
"Effective number of used states per site (measured using entropy):",
              String -> ByteString
BL.pack (String -> ByteString)
-> ([Double] -> String) -> [Double] -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Double] -> String
forall a. Show a => a -> String
show ([Double] -> ByteString) -> [Double] -> ByteString
forall a b. (a -> b) -> a -> b
$ [Double]
kEffs
            ]
        else ByteString
BL.empty

examine :: Bool -> [Seq.Sequence] -> BL.ByteString
examine :: Bool -> [Sequence] -> ByteString
examine Bool
perSiteFlag [Sequence]
ss =
  [Sequence] -> ByteString
Seq.summarizeSequences [Sequence]
ss ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> case [Sequence] -> Either String Alignment
M.fromSequences [Sequence]
ss of
    Left String
_ -> ByteString
BL.empty
    Right Alignment
a -> String -> ByteString
BL.pack String
"\n" ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Bool -> Alignment -> ByteString
examineAlignment Bool
perSiteFlag Alignment
a

-- | Examine sequences.
examineCmd :: ELynx ExamineArguments ()
examineCmd :: ELynx ExamineArguments ()
examineCmd = do
  (ExamineArguments Alphabet
al String
inFile Bool
perSiteFlag) <- Environment ExamineArguments -> ExamineArguments
forall a. Environment a -> a
localArguments (Environment ExamineArguments -> ExamineArguments)
-> ReaderT
     (Environment ExamineArguments) IO (Environment ExamineArguments)
-> ReaderT (Environment ExamineArguments) IO ExamineArguments
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ReaderT
  (Environment ExamineArguments) IO (Environment ExamineArguments)
forall (m :: * -> *) r. Monad m => ReaderT r m r
ask
  [Sequence]
ss <- Alphabet
-> String -> Logger (Environment ExamineArguments) [Sequence]
forall e.
(HasLock e, HasLogHandles e, HasVerbosity e) =>
Alphabet -> String -> Logger e [Sequence]
readSeqs Alphabet
al String
inFile
  let result :: ByteString
result = Bool -> [Sequence] -> ByteString
examine Bool
perSiteFlag [Sequence]
ss
  String -> ByteString -> String -> ELynx ExamineArguments ()
forall a.
Reproducible a =>
String -> ByteString -> String -> ELynx a ()
out String
"result of examination" ByteString
result String
".out"