{-# LANGUAGE TemplateHaskell #-}
module ELynx.Data.Sequence.Sequence
(
SequenceName
, SequenceCharacters
, Sequence (Sequence)
, name
, alphabet
, characters
, toCharacters
, fromCharacters
, showSequence
, showSequenceList
, sequenceListHeader
, summarizeSequence
, summarizeSequenceList
, summarizeSequenceListBody
, lengthSequence
, equalLength
, longest
, trimSequence
, concatenate
, concatenateSeqs
, filterShorterThan
, filterLongerThan
) where
import Control.Lens
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as L
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified Text.Printf as P
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Defaults
import ELynx.Tools.ByteString
import ELynx.Tools.Equality
type SequenceName = L.ByteString
type SequenceCharacters = V.Vector Character
data Sequence = Sequence { _name :: SequenceName
, _alphabet :: A.Alphabet
, _characters :: SequenceCharacters }
deriving (Eq)
makeLenses ''Sequence
toCharacters :: L.ByteString -> SequenceCharacters
toCharacters = V.fromList . map fromChar . L.unpack
fromCharacters :: SequenceCharacters -> L.ByteString
fromCharacters = L.pack . map toChar . V.toList
showInfo :: Sequence -> L.ByteString
showInfo s = L.unwords [ alignLeft defSequenceNameWidth (s^.name)
, alignRight defFieldWidth (L.pack $ show $ s^.alphabet)
, alignRight defFieldWidth (L.pack . show $ len)
, alignRight defFieldWidth (L.pack $ P.printf "%.3f" pGaps) ]
where len = lengthSequence s
nGaps = countGaps s
pGaps = fromIntegral nGaps / fromIntegral len :: Double
instance Show Sequence where
show s = L.unpack $ showSequence s
showSequence :: Sequence -> L.ByteString
showSequence s = L.unwords [showInfo s, fromCharacters $ s^.characters]
showSequenceList :: [Sequence] -> L.ByteString
showSequenceList = L.unlines . map showSequence
sequenceListHeader :: L.ByteString
sequenceListHeader = L.unwords [ alignLeft defSequenceNameWidth (L.pack "Name")
, alignRight defFieldWidth (L.pack "Code")
, alignRight defFieldWidth (L.pack "Length")
, alignRight defFieldWidth (L.pack "Gaps [%]")
, L.pack "Sequence" ]
summarizeSequence :: Sequence -> L.ByteString
summarizeSequence s = L.unwords [ showInfo s
, summarizeByteString defSequenceSummaryLength
(fromCharacters $ s^.characters) ]
summarizeSequenceList :: [Sequence] -> L.ByteString
summarizeSequenceList ss = summarizeSequenceListHeader ss <>
summarizeSequenceListBody (take defSequenceListSummaryNumber ss)
summarizeSequenceListHeader :: [Sequence] -> L.ByteString
summarizeSequenceListHeader ss = L.unlines $
reportIfSubsetIsShown ++
[ L.pack $ "For each sequence, the " ++ show defSequenceSummaryLength ++ " first bases are shown."
, L.pack $ "List contains " ++ show (length ss) ++ " sequences."
, L.pack ""
, sequenceListHeader ]
where l = length ss
s = show defSequenceListSummaryNumber ++ " out of " ++
show (length ss) ++ " sequences are shown."
reportIfSubsetIsShown
| l > defSequenceListSummaryNumber = [L.pack s]
| otherwise = []
summarizeSequenceListBody :: [Sequence] -> L.ByteString
summarizeSequenceListBody ss = L.unlines (map summarizeSequence ss `using` parListChunk 5 rdeepseq)
lengthSequence :: Sequence -> Int
lengthSequence s = fromIntegral $ V.length $ s ^. characters
equalLength :: [Sequence] -> Bool
equalLength = allEqual . map lengthSequence
longest :: [Sequence] -> Sequence
longest = maximumBy (comparing lengthSequence)
countGaps :: Sequence -> Int
countGaps s = V.length . V.filter (A.isGap (s^.alphabet)) $ s^.characters
trimSequence :: Int -> Sequence -> Sequence
trimSequence n = over characters (V.take $ fromIntegral n)
concatenate :: Sequence -> Sequence -> Sequence
concatenate (Sequence i c cs) (Sequence j k ks)
| i == j && c == k = Sequence i c (cs <> ks)
| otherwise = error $ "concatenate: Sequences do not have equal names: "
++ L.unpack i ++ ", " ++ L.unpack j ++ "."
concatenateSeqs :: [[Sequence]] -> [Sequence]
concatenateSeqs [] = error "concatenateSeqs: Nothing to concatenate."
concatenateSeqs [ss] = ss
concatenateSeqs sss = foldl1 (zipWith concatenate) sss
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan n = filter (\x -> lengthSequence x < n)
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan n = filter (\x -> lengthSequence x > n)