-- |
-- Module      :  ELynx.Data.Sequence.Alignment
-- Description :  Multi sequence alignment related types and functions
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
--
-- Portability :  portable
--
-- Creation date: Thu Oct  4 18:40:18 2018.
--
-- This module is to be imported qualified.
module ELynx.Data.Sequence.Alignment
  ( Alignment (..),
    length,
    nSequences,
    -- | * Input, output
    fromSequences,
    toSequences,
    summarize,
    -- | * Manipulation
    join,
    concat,
    concatAlignments,
    filterColsConstant,
    filterColsConstantSoft,
    filterColsOnlyStd,
    filterColsStd,
    filterColsNoGaps,
    -- | * Analysis
    FrequencyData,
    distribution,
    toFrequencyData,
    kEffEntropy,
    kEffHomoplasy,
    countIUPACChars,
    countGaps,
    countUnknowns,
    -- | * Sub sample
    subSample,
    randomSubSample,
  )
where

import Control.Monad hiding (join)
import Control.Monad.Primitive
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List hiding
  ( concat,
    length,
  )
import qualified Data.Matrix.Unboxed as M
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import qualified ELynx.Data.Alphabet.DistributionDiversity as D
import ELynx.Data.Sequence.Defaults
import qualified ELynx.Data.Sequence.Sequence as S
import System.Random.MWC
import Prelude hiding
  ( concat,
    length,
  )

-- | A collection of sequences.
data Alignment = Alignment
  { Alignment -> [Name]
names :: [S.Name],
    Alignment -> [Name]
descriptions :: [S.Description],
    Alignment -> Alphabet
alphabet :: A.Alphabet,
    Alignment -> Matrix Character
matrix :: M.Matrix Character
  }
  deriving (Int -> Alignment -> ShowS
[Alignment] -> ShowS
Alignment -> String
(Int -> Alignment -> ShowS)
-> (Alignment -> String)
-> ([Alignment] -> ShowS)
-> Show Alignment
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Alignment] -> ShowS
$cshowList :: [Alignment] -> ShowS
show :: Alignment -> String
$cshow :: Alignment -> String
showsPrec :: Int -> Alignment -> ShowS
$cshowsPrec :: Int -> Alignment -> ShowS
Show, Alignment -> Alignment -> Bool
(Alignment -> Alignment -> Bool)
-> (Alignment -> Alignment -> Bool) -> Eq Alignment
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Alignment -> Alignment -> Bool
$c/= :: Alignment -> Alignment -> Bool
== :: Alignment -> Alignment -> Bool
$c== :: Alignment -> Alignment -> Bool
Eq)

-- | Number of sites.
length :: Alignment -> Int
length :: Alignment -> Int
length = Matrix Character -> Int
forall a. Context a => Matrix a -> Int
M.cols (Matrix Character -> Int)
-> (Alignment -> Matrix Character) -> Alignment -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alignment -> Matrix Character
matrix

-- | Number of sequences.
nSequences :: Alignment -> Int
nSequences :: Alignment -> Int
nSequences = Matrix Character -> Int
forall a. Context a => Matrix a -> Int
M.rows (Matrix Character -> Int)
-> (Alignment -> Matrix Character) -> Alignment -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alignment -> Matrix Character
matrix

-- | Create 'Alignment' from a list of 'S.Sequence's.
fromSequences :: [S.Sequence] -> Either String Alignment
fromSequences :: [Sequence] -> Either String Alignment
fromSequences [Sequence]
ss
  | [Sequence] -> Bool
S.equalLength [Sequence]
ss Bool -> Bool -> Bool
&& [Alphabet] -> Bool
forall a. Eq a => [a] -> Bool
allEqual ((Sequence -> Alphabet) -> [Sequence] -> [Alphabet]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Alphabet
S.alphabet [Sequence]
ss) =
    Alignment -> Either String Alignment
forall a b. b -> Either a b
Right (Alignment -> Either String Alignment)
-> Alignment -> Either String Alignment
forall a b. (a -> b) -> a -> b
$
      [Name] -> [Name] -> Alphabet -> Matrix Character -> Alignment
Alignment [Name]
ns [Name]
ds Alphabet
a Matrix Character
d
  | [Sequence] -> Bool
S.equalLength [Sequence]
ss = String -> Either String Alignment
forall a b. a -> Either a b
Left String
"Sequences do not have equal codes."
  | Bool
otherwise = String -> Either String Alignment
forall a b. a -> Either a b
Left String
"Sequences do not have equal lengths."
  where
    ns :: [Name]
ns = (Sequence -> Name) -> [Sequence] -> [Name]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Name
S.name [Sequence]
ss
    ds :: [Name]
ds = (Sequence -> Name) -> [Sequence] -> [Name]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Name
S.description [Sequence]
ss
    a :: Alphabet
a = Sequence -> Alphabet
S.alphabet (Sequence -> Alphabet) -> Sequence -> Alphabet
forall a b. (a -> b) -> a -> b
$ [Sequence] -> Sequence
forall a. [a] -> a
head [Sequence]
ss
    bss :: [Characters]
bss = (Sequence -> Characters) -> [Sequence] -> [Characters]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Characters
S.characters [Sequence]
ss
    d :: Matrix Character
d = [Characters] -> Matrix Character
forall a. Context a => [Vector a] -> Matrix a
M.fromRows [Characters]
bss
    allEqual :: [a] -> Bool
allEqual [] = Bool
True
    allEqual [a]
xs = (a -> Bool) -> [a] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== [a] -> a
forall a. [a] -> a
head [a]
xs) ([a] -> Bool) -> [a] -> Bool
forall a b. (a -> b) -> a -> b
$ [a] -> [a]
forall a. [a] -> [a]
tail [a]
xs

-- | Conversion to list of 'S.Sequence's.
toSequences :: Alignment -> [S.Sequence]
toSequences :: Alignment -> [Sequence]
toSequences (Alignment [Name]
ns [Name]
ds Alphabet
a Matrix Character
da) =
  (Name -> Name -> Characters -> Sequence)
-> [Name] -> [Name] -> [Characters] -> [Sequence]
forall a b c d. (a -> b -> c -> d) -> [a] -> [b] -> [c] -> [d]
zipWith3
    (\Name
n Name
d Characters
r -> Name -> Name -> Alphabet -> Characters -> Sequence
S.Sequence Name
n Name
d Alphabet
a Characters
r)
    [Name]
ns
    [Name]
ds
    [Characters]
rows
  where
    rows :: [Characters]
rows = Matrix Character -> [Characters]
forall a. Context a => Matrix a -> [Vector a]
M.toRows Matrix Character
da

header :: Alignment -> BL.ByteString
header :: Alignment -> Name
header Alignment
a =
  [Name] -> Name
BL.unlines ([Name] -> Name) -> [Name] -> Name
forall a b. (a -> b) -> a -> b
$
    [ String -> Name
BL.pack String
"Multi sequence alignment.",
      String -> Name
BL.pack (String -> Name) -> String -> Name
forall a b. (a -> b) -> a -> b
$ String
"Code: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Alphabet -> String
A.alphabetDescription (Alignment -> Alphabet
alphabet Alignment
a) String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
".",
      String -> Name
BL.pack (String -> Name) -> String -> Name
forall a b. (a -> b) -> a -> b
$ String
"Length: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show (Alignment -> Int
length Alignment
a) String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"."
    ]
      [Name] -> [Name] -> [Name]
forall a. [a] -> [a] -> [a]
++ [Name]
reportLengthSummary
      [Name] -> [Name] -> [Name]
forall a. [a] -> [a] -> [a]
++ [Name]
reportNumberSummary
  where
    reportLengthSummary :: [Name]
reportLengthSummary =
      [ String -> Name
BL.pack (String -> Name) -> String -> Name
forall a b. (a -> b) -> a -> b
$
          String
"For each sequence, the "
            String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
summaryLength
            String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" first bases are shown."
        | Alignment -> Int
length Alignment
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
summaryLength
      ]
    reportNumberSummary :: [Name]
reportNumberSummary =
      [ String -> Name
BL.pack (String -> Name) -> String -> Name
forall a b. (a -> b) -> a -> b
$
          Int -> String
forall a. Show a => a -> String
show Int
summaryNSequences
            String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" out of "
            String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show (Alignment -> Int
nSequences Alignment
a)
            String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
" sequences are shown."
        | Alignment -> Int
nSequences Alignment
a Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
summaryNSequences
      ]

-- | Similar to 'S.summarizeSequenceList' but with different Header.
summarize :: Alignment -> BL.ByteString
summarize :: Alignment -> Name
summarize Alignment
a = Alignment -> Name
header Alignment
a Name -> Name -> Name
forall a. Semigroup a => a -> a -> a
<> [Sequence] -> Name
S.body (Alignment -> [Sequence]
toSequences Alignment
a)

-- Vertical concatenation.
(===) :: V.Unbox a => M.Matrix a -> M.Matrix a -> M.Matrix a
=== :: Matrix a -> Matrix a -> Matrix a
(===) Matrix a
l Matrix a
r = [Vector a] -> Matrix a
forall a. Context a => [Vector a] -> Matrix a
M.fromRows ([Vector a] -> Matrix a) -> [Vector a] -> Matrix a
forall a b. (a -> b) -> a -> b
$ [Vector a]
lRs [Vector a] -> [Vector a] -> [Vector a]
forall a. [a] -> [a] -> [a]
++ [Vector a]
rRs
  where
    lRs :: [Vector a]
lRs = Matrix a -> [Vector a]
forall a. Context a => Matrix a -> [Vector a]
M.toRows Matrix a
l
    rRs :: [Vector a]
rRs = Matrix a -> [Vector a]
forall a. Context a => Matrix a -> [Vector a]
M.toRows Matrix a
r

-- Horizontal concatenation.
(|||) :: V.Unbox a => M.Matrix a -> M.Matrix a -> M.Matrix a
||| :: Matrix a -> Matrix a -> Matrix a
(|||) Matrix a
l Matrix a
r = [Vector a] -> Matrix a
forall a. Context a => [Vector a] -> Matrix a
M.fromColumns ([Vector a] -> Matrix a) -> [Vector a] -> Matrix a
forall a b. (a -> b) -> a -> b
$ [Vector a]
lCs [Vector a] -> [Vector a] -> [Vector a]
forall a. [a] -> [a] -> [a]
++ [Vector a]
rCs
  where
    lCs :: [Vector a]
lCs = Matrix a -> [Vector a]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns Matrix a
l
    rCs :: [Vector a]
rCs = Matrix a -> [Vector a]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns Matrix a
r

-- | Join two 'Alignment's vertically. That is, add more sequences
-- to an alignment. See also 'concat'.
join :: Alignment -> Alignment -> Alignment
-- top bottom.
join :: Alignment -> Alignment -> Alignment
join Alignment
t Alignment
b
  | Alignment -> Int
length Alignment
t Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> Int
length Alignment
b =
    String -> Alignment
forall a. HasCallStack => String -> a
error
      String
"join: Multi sequence alignments do not have equal lengths."
  | Alignment -> Alphabet
alphabet Alignment
t Alphabet -> Alphabet -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> Alphabet
alphabet Alignment
b =
    String -> Alignment
forall a. HasCallStack => String -> a
error
      String
"join: Multi sequence alignments do not have equal alphabets."
  | Bool
otherwise = [Name] -> [Name] -> Alphabet -> Matrix Character -> Alignment
Alignment [Name]
ns [Name]
ds Alphabet
al (Matrix Character
tD Matrix Character -> Matrix Character -> Matrix Character
forall a. Unbox a => Matrix a -> Matrix a -> Matrix a
=== Matrix Character
bD)
  where
    ns :: [Name]
ns = Alignment -> [Name]
names Alignment
t [Name] -> [Name] -> [Name]
forall a. [a] -> [a] -> [a]
++ Alignment -> [Name]
names Alignment
b
    ds :: [Name]
ds = Alignment -> [Name]
descriptions Alignment
t [Name] -> [Name] -> [Name]
forall a. [a] -> [a] -> [a]
++ Alignment -> [Name]
descriptions Alignment
b
    tD :: Matrix Character
tD = Alignment -> Matrix Character
matrix Alignment
t
    bD :: Matrix Character
bD = Alignment -> Matrix Character
matrix Alignment
b
    al :: Alphabet
al = Alignment -> Alphabet
alphabet Alignment
t

-- | Concatenate two 'Alignment's horizontally. That is, add more
-- sites to an alignment. See also 'join'.
concat :: Alignment -> Alignment -> Alignment
-- left right.
concat :: Alignment -> Alignment -> Alignment
concat Alignment
l Alignment
r
  | Alignment -> Int
nSequences Alignment
l Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> Int
nSequences Alignment
r =
    String -> Alignment
forall a. HasCallStack => String -> a
error
      String
"concat: Multi sequence alignments do not have an equal number of sequences."
  | Alignment -> Alphabet
alphabet Alignment
l Alphabet -> Alphabet -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> Alphabet
alphabet Alignment
r =
    String -> Alignment
forall a. HasCallStack => String -> a
error String
"concat: Multi sequence alignments do not have an equal alphabets."
  | Alignment -> [Name]
names Alignment
l [Name] -> [Name] -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> [Name]
names Alignment
r =
    String -> Alignment
forall a. HasCallStack => String -> a
error String
"concat: Multi sequence alignments do not have an equal names."
  | Alignment -> [Name]
descriptions Alignment
l [Name] -> [Name] -> Bool
forall a. Eq a => a -> a -> Bool
/= Alignment -> [Name]
descriptions Alignment
r =
    String -> Alignment
forall a. HasCallStack => String -> a
error String
"concat: Multi sequence alignments do not have an equal descriptions."
  | Bool
otherwise =
    [Name] -> [Name] -> Alphabet -> Matrix Character -> Alignment
Alignment (Alignment -> [Name]
names Alignment
l) (Alignment -> [Name]
descriptions Alignment
l) (Alignment -> Alphabet
alphabet Alignment
l) (Matrix Character
lD Matrix Character -> Matrix Character -> Matrix Character
forall a. Unbox a => Matrix a -> Matrix a -> Matrix a
||| Matrix Character
rD)
  where
    lD :: Matrix Character
lD = Alignment -> Matrix Character
matrix Alignment
l
    rD :: Matrix Character
rD = Alignment -> Matrix Character
matrix Alignment
r

-- | Concatenate a list of 'Alignment's horizontally. See
-- 'concat'.
concatAlignments :: [Alignment] -> Alignment
concatAlignments :: [Alignment] -> Alignment
concatAlignments [] = String -> Alignment
forall a. HasCallStack => String -> a
error String
"concatAlignments: Nothing to concatenate."
concatAlignments [Alignment
a] = Alignment
a
concatAlignments [Alignment]
as = (Alignment -> Alignment -> Alignment)
-> Alignment -> [Alignment] -> Alignment
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' Alignment -> Alignment -> Alignment
concat ([Alignment] -> Alignment
forall a. [a] -> a
head [Alignment]
as) ([Alignment] -> [Alignment]
forall a. [a] -> [a]
tail [Alignment]
as)

-- Only keep columns from alignment that satisfy given predicate.
filterColsWith :: (V.Vector Character -> Bool) -> Alignment -> Alignment
filterColsWith :: (Characters -> Bool) -> Alignment -> Alignment
filterColsWith Characters -> Bool
p Alignment
a = Alignment
a {matrix :: Matrix Character
matrix = Matrix Character
m'}
  where
    m' :: Matrix Character
m' = [Characters] -> Matrix Character
forall a. Context a => [Vector a] -> Matrix a
M.fromColumns ([Characters] -> Matrix Character)
-> (Matrix Character -> [Characters])
-> Matrix Character
-> Matrix Character
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Characters -> Bool) -> [Characters] -> [Characters]
forall a. (a -> Bool) -> [a] -> [a]
filter Characters -> Bool
p ([Characters] -> [Characters])
-> (Matrix Character -> [Characters])
-> Matrix Character
-> [Characters]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Matrix Character -> [Characters]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns (Matrix Character -> Matrix Character)
-> Matrix Character -> Matrix Character
forall a b. (a -> b) -> a -> b
$ Alignment -> Matrix Character
matrix Alignment
a

-- | Only keep constant columns.
filterColsConstant :: Alignment -> Alignment
filterColsConstant :: Alignment -> Alignment
filterColsConstant = (Characters -> Bool) -> Alignment -> Alignment
filterColsWith (\Characters
v -> (Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.all (Character -> Character -> Bool
forall a. Eq a => a -> a -> Bool
== Characters -> Character
forall a. Unbox a => Vector a -> a
V.head Characters
v) Characters
v)

-- | Only keep constant columns, and constant columns with at least one standard
-- character as well as any number of gaps or unknowns.
filterColsConstantSoft :: Alignment -> Alignment
filterColsConstantSoft :: Alignment -> Alignment
filterColsConstantSoft Alignment
a = (Characters -> Bool) -> Alignment -> Alignment
filterColsWith Characters -> Bool
f Alignment
a
  where
    al :: Alphabet
al = Alignment -> Alphabet
alphabet Alignment
a
    f :: Characters -> Bool
f Characters
v = case (Character -> Bool) -> Characters -> Maybe Character
forall a. Unbox a => (a -> Bool) -> Vector a -> Maybe a
V.find (Alphabet -> Character -> Bool
A.isStd Alphabet
al) Characters
v of
      Maybe Character
Nothing -> Bool
False
      Just Character
c -> (Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.all (\Character
x -> Character
x Character -> Character -> Bool
forall a. Eq a => a -> a -> Bool
== Character
c Bool -> Bool -> Bool
|| Alphabet -> Character -> Bool
A.isGap Alphabet
al Character
x Bool -> Bool -> Bool
|| Alphabet -> Character -> Bool
A.isUnknown Alphabet
al Character
x) Characters
v

-- | Only keep columns with standard characters. Alignment columns with IUPAC
-- characters are removed.
filterColsOnlyStd :: Alignment -> Alignment
filterColsOnlyStd :: Alignment -> Alignment
filterColsOnlyStd Alignment
a = (Characters -> Bool) -> Alignment -> Alignment
filterColsWith ((Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.all ((Character -> Bool) -> Characters -> Bool)
-> (Character -> Bool) -> Characters -> Bool
forall a b. (a -> b) -> a -> b
$ Alphabet -> Character -> Bool
A.isStd (Alignment -> Alphabet
alphabet Alignment
a)) Alignment
a

-- | Filter columns with proportion of standard character larger than given number.
filterColsStd :: Double -> Alignment -> Alignment
filterColsStd :: Double -> Alignment -> Alignment
filterColsStd Double
prop Alignment
a =
  (Characters -> Bool) -> Alignment -> Alignment
filterColsWith
    (\Characters
col -> Double
prop Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
n Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length ((Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isStd Alphabet
al) Characters
col)))
    Alignment
a
  where
    al :: Alphabet
al = Alignment -> Alphabet
alphabet Alignment
a
    n :: Double
n = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ Alignment -> Int
nSequences Alignment
a

-- | Only keep columns without gaps or unknown characters.
filterColsNoGaps :: Alignment -> Alignment
filterColsNoGaps :: Alignment -> Alignment
filterColsNoGaps Alignment
a = (Characters -> Bool) -> Alignment -> Alignment
filterColsWith ((Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.all ((Character -> Bool) -> Characters -> Bool)
-> (Character -> Bool) -> Characters -> Bool
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool) -> (Character -> Bool) -> Character -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alphabet -> Character -> Bool
A.isGap (Alignment -> Alphabet
alphabet Alignment
a)) Alignment
a

-- | Frequency data; do not store the actual characters, but their frequencies.
-- The matrix is of size @N x K@, where @N@ is the number of sites, and @K@ is
-- the number of characters.
type FrequencyData = M.Matrix Double

-- Map a function on each column of a DIM2 array; parallel version with given chunk size.
fMapColParChunk ::
  (V.Unbox a, V.Unbox b) =>
  Int ->
  (V.Vector a -> V.Vector b) ->
  M.Matrix a ->
  M.Matrix b
fMapColParChunk :: Int -> (Vector a -> Vector b) -> Matrix a -> Matrix b
fMapColParChunk Int
n Vector a -> Vector b
f Matrix a
m =
  [Vector b] -> Matrix b
forall a. Context a => [Vector a] -> Matrix a
M.fromColumns ((Vector a -> Vector b) -> [Vector a] -> [Vector b]
forall a b. (a -> b) -> [a] -> [b]
map Vector a -> Vector b
f (Matrix a -> [Vector a]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns Matrix a
m) [Vector b] -> Strategy [Vector b] -> [Vector b]
forall a. a -> Strategy a -> a
`using` Int -> Strategy (Vector b) -> Strategy [Vector b]
forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
n Strategy (Vector b)
forall a. Strategy a
rseq)

-- | Calculcate frequency of characters at each site of a multi sequence alignment.
toFrequencyData :: Alignment -> FrequencyData
toFrequencyData :: Alignment -> FrequencyData
toFrequencyData Alignment
a = Int
-> (Characters -> Vector Double)
-> Matrix Character
-> FrequencyData
forall a b.
(Unbox a, Unbox b) =>
Int -> (Vector a -> Vector b) -> Matrix a -> Matrix b
fMapColParChunk Int
100 (AlphabetSpec -> Characters -> Vector Double
forall (v :: * -> *).
(Vector v Character, Vector v Int, Vector v Double) =>
AlphabetSpec -> v Character -> v Double
D.frequencyCharacters AlphabetSpec
spec) (Alignment -> Matrix Character
matrix Alignment
a)
  where
    spec :: AlphabetSpec
spec = Alphabet -> AlphabetSpec
A.alphabetSpec (Alignment -> Alphabet
alphabet Alignment
a)

-- | Calculate the distribution of characters.
distribution :: FrequencyData -> [Double]
distribution :: FrequencyData -> [Double]
distribution FrequencyData
fd =
  (Double -> Double) -> [Double] -> [Double]
forall a b. (a -> b) -> [a] -> [b]
map (Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nSites) ([Double] -> [Double]) -> [Double] -> [Double]
forall a b. (a -> b) -> a -> b
$
    Vector Double -> [Double]
forall a. Unbox a => Vector a -> [a]
V.toList (Vector Double -> [Double]) -> Vector Double -> [Double]
forall a b. (a -> b) -> a -> b
$
      (Vector Double -> Vector Double -> Vector Double)
-> [Vector Double] -> Vector Double
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1
        ((Double -> Double -> Double)
-> Vector Double -> Vector Double -> Vector Double
forall a b c.
(Unbox a, Unbox b, Unbox c) =>
(a -> b -> c) -> Vector a -> Vector b -> Vector c
V.zipWith Double -> Double -> Double
forall a. Num a => a -> a -> a
(+))
        (FrequencyData -> [Vector Double]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns FrequencyData
fd)
  where
    nSites :: Int
nSites = FrequencyData -> Int
forall a. Context a => Matrix a -> Int
M.cols FrequencyData
fd

-- Parallel map with given chunk size.
parMapChunk :: Int -> (a -> b) -> [a] -> [b]
parMapChunk :: Int -> (a -> b) -> [a] -> [b]
parMapChunk Int
n a -> b
f [a]
as = (a -> b) -> [a] -> [b]
forall a b. (a -> b) -> [a] -> [b]
map a -> b
f [a]
as [b] -> Strategy [b] -> [b]
forall a. a -> Strategy a -> a
`using` Int -> Strategy b -> Strategy [b]
forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
n Strategy b
forall a. Strategy a
rseq

chunksize :: Int
chunksize :: Int
chunksize = Int
500

-- | Diversity analysis. See 'kEffEntropy'.
kEffEntropy :: FrequencyData -> [Double]
kEffEntropy :: FrequencyData -> [Double]
kEffEntropy FrequencyData
fd = Int -> (Vector Double -> Double) -> [Vector Double] -> [Double]
forall a b. Int -> (a -> b) -> [a] -> [b]
parMapChunk Int
chunksize Vector Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
D.kEffEntropy (FrequencyData -> [Vector Double]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns FrequencyData
fd)

-- | Diversity analysis. See 'kEffEntropy'.
kEffHomoplasy :: FrequencyData -> [Double]
kEffHomoplasy :: FrequencyData -> [Double]
kEffHomoplasy FrequencyData
fd = Int -> (Vector Double -> Double) -> [Vector Double] -> [Double]
forall a b. Int -> (a -> b) -> [a] -> [b]
parMapChunk Int
chunksize Vector Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
D.kEffHomoplasy (FrequencyData -> [Vector Double]
forall a. Context a => Matrix a -> [Vector a]
M.toColumns FrequencyData
fd)

-- | Count the number of standard (i.e., not extended IUPAC) characters in the
-- alignment.
countIUPACChars :: Alignment -> Int
countIUPACChars :: Alignment -> Int
countIUPACChars Alignment
a = Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int)
-> (Characters -> Characters) -> Characters -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isIUPAC (Alignment -> Alphabet
alphabet Alignment
a)) (Characters -> Int) -> Characters -> Int
forall a b. (a -> b) -> a -> b
$ Characters
allChars
  where
    allChars :: Characters
allChars = Matrix Character -> Characters
forall a. Context a => Matrix a -> Vector a
M.flatten (Matrix Character -> Characters) -> Matrix Character -> Characters
forall a b. (a -> b) -> a -> b
$ Alignment -> Matrix Character
matrix Alignment
a

-- | Count the number of gaps in the alignment.
countGaps :: Alignment -> Int
countGaps :: Alignment -> Int
countGaps Alignment
a = Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int)
-> (Characters -> Characters) -> Characters -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isGap (Alignment -> Alphabet
alphabet Alignment
a)) (Characters -> Int) -> Characters -> Int
forall a b. (a -> b) -> a -> b
$ Characters
allChars
  where
    allChars :: Characters
allChars = Matrix Character -> Characters
forall a. Context a => Matrix a -> Vector a
M.flatten (Matrix Character -> Characters) -> Matrix Character -> Characters
forall a b. (a -> b) -> a -> b
$ Alignment -> Matrix Character
matrix Alignment
a

-- | Count the number of unknown characters in the alignment.
countUnknowns :: Alignment -> Int
countUnknowns :: Alignment -> Int
countUnknowns Alignment
a = Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int)
-> (Characters -> Characters) -> Characters -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isUnknown (Alignment -> Alphabet
alphabet Alignment
a)) (Characters -> Int) -> Characters -> Int
forall a b. (a -> b) -> a -> b
$ Characters
allChars
  where
    allChars :: Characters
allChars = Matrix Character -> Characters
forall a. Context a => Matrix a -> Vector a
M.flatten (Matrix Character -> Characters) -> Matrix Character -> Characters
forall a b. (a -> b) -> a -> b
$ Alignment -> Matrix Character
matrix Alignment
a

-- Sample the given sites from a matrix.
subSampleMatrix :: V.Unbox a => [Int] -> M.Matrix a -> M.Matrix a
subSampleMatrix :: [Int] -> Matrix a -> Matrix a
subSampleMatrix [Int]
is Matrix a
m =
  [Vector a] -> Matrix a
forall a. Context a => [Vector a] -> Matrix a
M.fromColumns ([Vector a] -> Matrix a) -> [Vector a] -> Matrix a
forall a b. (a -> b) -> a -> b
$ ([Vector a] -> Int -> [Vector a])
-> [Vector a] -> [Int] -> [Vector a]
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\[Vector a]
a Int
i -> Matrix a -> Int -> Vector a
forall a. Context a => Matrix a -> Int -> Vector a
M.takeColumn Matrix a
m Int
i Vector a -> [Vector a] -> [Vector a]
forall a. a -> [a] -> [a]
: [Vector a]
a) [] ([Int] -> [Int]
forall a. [a] -> [a]
reverse [Int]
is)

-- | Sample the given sites from a multi sequence alignment.
subSample :: [Int] -> Alignment -> Alignment
subSample :: [Int] -> Alignment -> Alignment
subSample [Int]
is Alignment
a = Alignment
a {matrix :: Matrix Character
matrix = Matrix Character
m'} where m' :: Matrix Character
m' = [Int] -> Matrix Character -> Matrix Character
forall a. Unbox a => [Int] -> Matrix a -> Matrix a
subSampleMatrix [Int]
is (Matrix Character -> Matrix Character)
-> Matrix Character -> Matrix Character
forall a b. (a -> b) -> a -> b
$ Alignment -> Matrix Character
matrix Alignment
a

-- | Randomly sample a given number of sites of the multi sequence alignment.
randomSubSample ::
  PrimMonad m => Int -> Alignment -> Gen (PrimState m) -> m Alignment
randomSubSample :: Int -> Alignment -> Gen (PrimState m) -> m Alignment
randomSubSample Int
n Alignment
a Gen (PrimState m)
g = do
  let l :: Int
l = Alignment -> Int
length Alignment
a
  [Int]
is <- Int -> m Int -> m [Int]
forall (m :: * -> *) a. Applicative m => Int -> m a -> m [a]
replicateM Int
n (m Int -> m [Int]) -> m Int -> m [Int]
forall a b. (a -> b) -> a -> b
$ (Int, Int) -> Gen (PrimState m) -> m Int
forall a (m :: * -> *).
(Variate a, PrimMonad m) =>
(a, a) -> Gen (PrimState m) -> m a
uniformR (Int
0, Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Gen (PrimState m)
g
  Alignment -> m Alignment
forall (m :: * -> *) a. Monad m => a -> m a
return (Alignment -> m Alignment) -> Alignment -> m Alignment
forall a b. (a -> b) -> a -> b
$ [Int] -> Alignment -> Alignment
subSample [Int]
is Alignment
a