{-# LANGUAGE TemplateHaskell #-}
module ELynx.Data.Sequence.MultiSequenceAlignment
( MultiSequenceAlignment (MultiSequenceAlignment)
, msaLength
, msaNSequences
, fromSequenceList
, toSequenceList
, showMSA
, summarizeMSA
, msaJoin
, msaConcatenate
, msasConcatenate
, filterColumnsOnlyStd
, filterColumnsStd
, filterColumnsNoGaps
, FrequencyData
, toFrequencyData
, kEffEntropy
, kEffHomoplasy
, countIUPACChars
, countGaps
, countUnknowns
, subSample
, randomSubSample
) where
import Control.Lens
import Control.Monad
import Control.Monad.Primitive
import qualified Data.ByteString.Lazy.Char8 as L
import Data.List
import qualified Data.Matrix.Unboxed as M
import qualified Data.Vector.Unboxed as V
import System.Random.MWC
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import qualified ELynx.Data.Alphabet.DistributionDiversity as D
import ELynx.Data.Sequence.Defaults
import qualified ELynx.Data.Sequence.Sequence as S
import ELynx.Tools.ByteString
import ELynx.Tools.Concurrent
import ELynx.Tools.Equality
import ELynx.Tools.Matrix
data MultiSequenceAlignment = MultiSequenceAlignment
{ _names :: [S.SequenceName]
, _alphabet :: A.Alphabet
, _matrix :: M.Matrix Character
}
deriving (Read, Show, Eq)
makeLenses ''MultiSequenceAlignment
msaLength :: MultiSequenceAlignment -> Int
msaLength = M.cols . view matrix
msaNSequences :: MultiSequenceAlignment -> Int
msaNSequences = M.rows . view matrix
fromSequenceList :: [S.Sequence] -> Either String MultiSequenceAlignment
fromSequenceList ss
| S.equalLength ss && allEqual (map (view S.alphabet) ss) =
Right $ MultiSequenceAlignment ns a d
| S.equalLength ss =
Left "Sequences do not have equal codes."
| otherwise =
Left "Sequences do not have equal lengths."
where
ns = map (view S.name) ss
a = head ss ^. S.alphabet
bss = map (view S.characters) ss
d = M.fromRows bss
toSequenceList :: MultiSequenceAlignment -> [S.Sequence]
toSequenceList (MultiSequenceAlignment ns a d) = zipWith (\n r -> S.Sequence n a r) ns rows
where
rows = M.toRows d
msaHeader :: L.ByteString
msaHeader = L.unwords [ alignLeft defSequenceNameWidth (L.pack "Name")
, L.pack "Sequence" ]
showSequence :: MultiSequenceAlignment -> Int -> L.ByteString
showSequence m i =
L.unwords [ alignLeft defSequenceNameWidth $ (m ^. names) !! i
, S.fromCharacters $ M.takeRow (m ^. matrix) i ]
summarizeSequence :: MultiSequenceAlignment -> Int -> L.ByteString
summarizeSequence m i =
L.unwords [ alignLeft defSequenceNameWidth $ (m ^. names) !! i
, summarizeByteString defSequenceSummaryLength $
S.fromCharacters $ M.takeRow (m ^. matrix) i ]
showMSA :: MultiSequenceAlignment -> L.ByteString
showMSA msa = L.unlines $ msaHeader :
map (showSequence msa) [0 .. (msaNSequences msa - 1)]
summarizeMSAHeader :: MultiSequenceAlignment -> L.ByteString
summarizeMSAHeader msa = L.unlines $
[ L.pack "Multi sequence alignment."
, L.pack $ "Code: " ++ A.alphabetNameVerbose (msa^.alphabet) ++ "."
, L.pack $ "Length: " ++ show (msaLength msa) ++ "." ]
++ reportLengthSummary ++ reportNumberSummary
where reportLengthSummary =
[ L.pack $ "For each sequence, the "
++ show defSequenceSummaryLength ++ " first bases are shown."
| msaLength msa > defSequenceSummaryLength ]
reportNumberSummary =
[ L.pack $ show defSequenceListSummaryNumber ++ " out of " ++
show (msaNSequences msa) ++ " sequences are shown."
| msaNSequences msa > defSequenceListSummaryNumber ]
summarizeMSA :: MultiSequenceAlignment -> L.ByteString
summarizeMSA msa = L.unlines $ summarizeMSAHeader msa :
map (summarizeSequence msa) [0 .. n - 1]
where n = min (msaNSequences msa) defSequenceListSummaryNumber
msaJoin :: MultiSequenceAlignment
-> MultiSequenceAlignment
-> MultiSequenceAlignment
msaJoin t b
| msaLength t == msaLength b
&& t^.alphabet == b^.alphabet
= MultiSequenceAlignment ns a (tD === bD)
| otherwise = error "msaJoin: Multi sequence alignments do not have equal length."
where
ns = t^.names ++ b^.names
tD = t^.matrix
bD = t^.matrix
a = t^.alphabet
msaConcatenate :: MultiSequenceAlignment
-> MultiSequenceAlignment
-> MultiSequenceAlignment
msaConcatenate l r
| msaNSequences l == msaNSequences r
&& l^.alphabet == r^.alphabet
= MultiSequenceAlignment (l ^. names) a (lD ||| rD)
| otherwise = error "msaConcatenate: Multi sequence alignments do not have equal length."
where
lD = l^.matrix
rD = r^.matrix
a = l^.alphabet
msasConcatenate :: [MultiSequenceAlignment] -> MultiSequenceAlignment
msasConcatenate [] = error "msasConcatenate: Nothing to concatenate."
msasConcatenate [msa] = msa
msasConcatenate msas = foldl' msaConcatenate (head msas) (tail msas)
filterColumnsWith :: (V.Vector Character -> Bool) -> MultiSequenceAlignment -> MultiSequenceAlignment
filterColumnsWith p = over matrix (M.fromColumns . filter p . M.toColumns)
filterColumnsOnlyStd :: MultiSequenceAlignment -> MultiSequenceAlignment
filterColumnsOnlyStd msa = filterColumnsWith (V.all $ A.isStd (msa^.alphabet)) msa
filterColumnsStd :: Double -> MultiSequenceAlignment -> MultiSequenceAlignment
filterColumnsStd prop msa = filterColumnsWith
(\col -> prop * nSeqs <= fromIntegral (V.length (V.filter (A.isStd a) col)))
msa
where a = msa^.alphabet
nSeqs = fromIntegral $ msaNSequences msa
filterColumnsNoGaps :: MultiSequenceAlignment -> MultiSequenceAlignment
filterColumnsNoGaps msa = filterColumnsWith (V.all $ not . A.isGap (msa^.alphabet)) msa
type FrequencyData = M.Matrix Double
toFrequencyData :: MultiSequenceAlignment -> FrequencyData
toFrequencyData msa = fMapColParChunk 100 (D.frequencyCharacters spec) (msa^.matrix)
where spec = A.alphabetSpec (msa^.alphabet)
kEffEntropy :: FrequencyData -> [Double]
kEffEntropy fd = parMapChunk 500 D.kEffEntropy (M.toColumns fd)
kEffHomoplasy :: FrequencyData -> [Double]
kEffHomoplasy fd = parMapChunk 500 D.kEffHomoplasy (M.toColumns fd)
countIUPACChars :: MultiSequenceAlignment -> Int
countIUPACChars msa = V.length . V.filter (A.isIUPAC (msa^.alphabet)) $ allChars
where allChars = M.flatten $ msa^.matrix
countGaps :: MultiSequenceAlignment -> Int
countGaps msa = V.length . V.filter (A.isGap (msa^.alphabet)) $ allChars
where allChars = M.flatten $ msa^.matrix
countUnknowns :: MultiSequenceAlignment -> Int
countUnknowns msa = V.length . V.filter (A.isUnknown (msa^.alphabet)) $ allChars
where allChars = M.flatten $ msa^.matrix
subSample :: [Int] -> MultiSequenceAlignment -> MultiSequenceAlignment
subSample is = over matrix (subSampleMatrix is)
randomSubSample :: PrimMonad m
=> Int -> MultiSequenceAlignment -> Gen (PrimState m) -> m MultiSequenceAlignment
randomSubSample n msa g = do
let l = msaLength msa
is <- replicateM n $ uniformR (0, l-1) g
return $ subSample is msa