module ELynx.Data.Sequence.Alignment
( Alignment (..)
, length
, nSequences
, fromSequences
, toSequences
, summarize
, join
, concat
, concatAlignments
, filterColsOnlyStd
, filterColsStd
, filterColsNoGaps
, FrequencyData
, distribution
, toFrequencyData
, kEffEntropy
, kEffHomoplasy
, countIUPACChars
, countGaps
, countUnknowns
, subSample
, randomSubSample
) where
import Control.Monad hiding (join)
import Control.Monad.Primitive
import qualified Data.ByteString.Lazy.Char8 as L
import Data.List hiding (concat,
length)
import qualified Data.Matrix.Unboxed as M
import qualified Data.Vector.Unboxed as V
import Prelude hiding (concat,
length)
import System.Random.MWC
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import qualified ELynx.Data.Alphabet.DistributionDiversity as D
import ELynx.Data.Sequence.Defaults
import qualified ELynx.Data.Sequence.Sequence as S
import ELynx.Tools.Concurrent
import ELynx.Tools.Definitions
import ELynx.Tools.Equality
import ELynx.Tools.Matrix
data Alignment = Alignment
{ names :: [S.Name]
, alphabet :: A.Alphabet
, matrix :: M.Matrix Character
}
deriving (Show, Eq)
length :: Alignment -> Int
length = M.cols . matrix
nSequences :: Alignment -> Int
nSequences = M.rows . matrix
fromSequences :: [S.Sequence] -> Either String Alignment
fromSequences ss
| S.equalLength ss && allEqual (map S.alphabet ss) =
Right $ Alignment ns a d
| S.equalLength ss =
Left "Sequences do not have equal codes."
| otherwise =
Left "Sequences do not have equal lengths."
where
ns = map S.name ss
a = S.alphabet $ head ss
bss = map S.characters ss
d = M.fromRows bss
toSequences :: Alignment -> [S.Sequence]
toSequences (Alignment ns a d) = zipWith (\n r -> S.Sequence n a r) ns rows
where
rows = M.toRows d
header :: Alignment -> L.ByteString
header a = L.unlines $
[ L.pack "Multi sequence alignment."
, L.pack $ "Code: " ++ A.description (alphabet a) ++ "."
, L.pack $ "Length: " ++ show (length a) ++ "." ]
++ reportLengthSummary ++ reportNumberSummary
where reportLengthSummary =
[ L.pack $ "For each sequence, the "
++ show summaryLength ++ " first bases are shown."
| length a > summaryLength ]
reportNumberSummary =
[ L.pack $ show summaryNSequences ++ " out of " ++
show (nSequences a) ++ " sequences are shown."
| nSequences a > summaryNSequences ]
summarize :: Alignment -> L.ByteString
summarize a = header a <> S.body (toSequences a)
join :: Alignment
-> Alignment
-> Alignment
join t b
| length t == length b &&
al == alphabet b
= Alignment ns al (tD === bD)
| otherwise = error "join: Multi sequence alignments do not have equal length."
where
ns = names t ++ names b
tD = matrix t
bD = matrix b
al = alphabet t
concat :: Alignment
-> Alignment
-> Alignment
concat l r
| nSequences l == nSequences r &&
al == alphabet r
= Alignment (names l) al (lD ||| rD)
| otherwise = error "concat: Multi sequence alignments do not have equal length."
where
lD = matrix l
rD = matrix r
al = alphabet l
concatAlignments :: [Alignment] -> Alignment
concatAlignments [] = error "concatAlignments: Nothing to concatenate."
concatAlignments [a] = a
concatAlignments as = foldl' concat (head as) (tail as)
filterColsWith :: (V.Vector Character -> Bool) -> Alignment -> Alignment
filterColsWith p a = a {matrix = m'}
where m' = M.fromColumns . filter p . M.toColumns $ matrix a
filterColsOnlyStd :: Alignment -> Alignment
filterColsOnlyStd a = filterColsWith (V.all $ A.isStd (alphabet a)) a
filterColsStd :: Double -> Alignment -> Alignment
filterColsStd prop a = filterColsWith
(\col -> prop * n <= fromIntegral (V.length (V.filter (A.isStd al) col))) a
where al = alphabet a
n = fromIntegral $ nSequences a
filterColsNoGaps :: Alignment -> Alignment
filterColsNoGaps a = filterColsWith (V.all $ not . A.isGap (alphabet a)) a
type FrequencyData = M.Matrix Double
toFrequencyData :: Alignment -> FrequencyData
toFrequencyData a = fMapColParChunk 100 (D.frequencyCharacters spec) (matrix a)
where spec = A.alphabetSpec (alphabet a)
distribution :: FrequencyData -> [Double]
distribution fd = map (/ fromIntegral nSites) $ V.toList $
foldl1 (V.zipWith (+)) (M.toColumns fd)
where nSites = M.cols fd
kEffEntropy :: FrequencyData -> [Double]
kEffEntropy fd = parMapChunk chunksize D.kEffEntropy (M.toColumns fd)
kEffHomoplasy :: FrequencyData -> [Double]
kEffHomoplasy fd = parMapChunk chunksize D.kEffHomoplasy (M.toColumns fd)
countIUPACChars :: Alignment -> Int
countIUPACChars a = V.length . V.filter (A.isIUPAC (alphabet a)) $ allChars
where allChars = M.flatten $ matrix a
countGaps :: Alignment -> Int
countGaps a = V.length . V.filter (A.isGap (alphabet a)) $ allChars
where allChars = M.flatten $ matrix a
countUnknowns :: Alignment -> Int
countUnknowns a = V.length . V.filter (A.isUnknown (alphabet a)) $ allChars
where allChars = M.flatten $ matrix a
subSample :: [Int] -> Alignment -> Alignment
subSample is a = a {matrix = m'}
where m' = subSampleMatrix is $ matrix a
randomSubSample :: PrimMonad m
=> Int -> Alignment -> Gen (PrimState m) -> m Alignment
randomSubSample n a g = do
let l = length a
is <- replicateM n $ uniformR (0, l-1) g
return $ subSample is a