{-# LANGUAGE OverloadedStrings #-}
module ELynx.Data.Sequence.Sequence
(
Name
, Characters
, Sequence (..)
, fromByteString
, toByteString
, header
, summarize
, summarizeSequences
, body
, length
, equalLength
, longest
, trim
, concat
, concatSequences
, filterShorterThan
, filterLongerThan
, filterStandard
) where
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as L
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import Prelude hiding (concat, length)
import qualified Prelude as Pr (length)
import qualified Text.Printf as P
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Defaults
import ELynx.Tools.ByteString
import ELynx.Tools.Equality
type Name = L.ByteString
type Characters = V.Vector Character
fromByteString :: L.ByteString -> Characters
fromByteString = V.fromList . map fromChar . L.unpack
toByteString :: Characters -> L.ByteString
toByteString = L.pack . map toChar . V.toList
data Sequence = Sequence { name :: Name
, alphabet :: A.Alphabet
, characters :: Characters }
deriving (Show, Eq)
getInfo :: Sequence -> L.ByteString
getInfo s = L.unwords [ alignLeft nameWidth (name s)
, alignRight fieldWidth (L.pack $ show $ alphabet s)
, alignRight fieldWidth (L.pack . show $ len)
, alignRight fieldWidth (L.pack $ P.printf "%.3f" pGaps) ]
where len = length s
nGaps = countGaps s
pGaps = fromIntegral nGaps / fromIntegral len :: Double
summarize :: Sequence -> L.ByteString
summarize s = L.unwords [ getInfo s , summarizeByteString summaryLength $ toByteString (characters s) ]
summarizeSequences :: [Sequence] -> L.ByteString
summarizeSequences ss = header ss <> body (take summaryNSequences ss)
tableHeader :: L.ByteString
tableHeader = L.unwords [ alignLeft nameWidth "Name"
, alignRight fieldWidth "Code"
, alignRight fieldWidth "Length"
, alignRight fieldWidth "Gaps [%]"
, "Sequence" ]
header :: [Sequence] -> L.ByteString
header ss = L.unlines $
reportIfSubsetIsShown ++
[ L.pack $ "For each sequence, the " ++ show summaryLength ++ " first bases are shown."
, L.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences."
, ""
, tableHeader ]
where l = Pr.length ss
s = show summaryNSequences ++ " out of " ++
show (Pr.length ss) ++ " sequences are shown."
reportIfSubsetIsShown
| l > summaryNSequences = [L.pack s]
| otherwise = []
body :: [Sequence] -> L.ByteString
body ss = L.unlines (map summarize ss `using` parListChunk 5 rdeepseq)
length :: Sequence -> Int
length = fromIntegral . V.length . characters
equalLength :: [Sequence] -> Bool
equalLength = allEqual . map length
longest :: [Sequence] -> Sequence
longest = maximumBy (comparing length)
countGaps :: Sequence -> Int
countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s
trim :: Int -> Sequence -> Sequence
trim n (Sequence nm a cs) = Sequence nm a (V.take (fromIntegral n) cs)
concat :: Sequence -> Sequence -> Sequence
concat (Sequence i c cs) (Sequence j k ks)
| i == j && c == k = Sequence i c (cs <> ks)
| otherwise = error $ "concatenate: Sequences do not have equal names: "
++ L.unpack i ++ ", " ++ L.unpack j ++ "."
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = error "concatenateSequences: Nothing to concatenate."
concatSequences [ss] = ss
concatSequences sss = foldl1 (zipWith concat) sss
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan n = filter (\x -> length x < n)
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan n = filter (\x -> length x > n)
filterStandard :: [Sequence] -> [Sequence]
filterStandard = filter (\s -> anyStandard (alphabet s) s)
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard a s = V.any (A.isStd a) cs
where cs = characters s