{-# LANGUAGE OverloadedStrings #-}
module ELynx.Data.Sequence.Sequence
(
Name,
Description,
Characters,
Sequence (..),
fromByteString,
toByteString,
header,
summarize,
summarizeSequences,
body,
length,
equalLength,
longest,
trim,
concat,
concatSequences,
filterShorterThan,
filterLongerThan,
filterStandard,
)
where
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Defaults
import qualified Text.Printf as P
import Prelude hiding
( concat,
length,
)
import qualified Prelude as Pr
( length,
)
type Name = BL.ByteString
type Description = BL.ByteString
type Characters = V.Vector Character
fromByteString :: BL.ByteString -> Characters
fromByteString :: ByteString -> Characters
fromByteString = [Character] -> Characters
forall a. Unbox a => [a] -> Vector a
V.fromList ([Character] -> Characters)
-> (ByteString -> [Character]) -> ByteString -> Characters
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Character) -> [Char] -> [Character]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Character
fromChar ([Char] -> [Character])
-> (ByteString -> [Char]) -> ByteString -> [Character]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Char]
BL.unpack
toByteString :: Characters -> BL.ByteString
toByteString :: Characters -> ByteString
toByteString = [Char] -> ByteString
BL.pack ([Char] -> ByteString)
-> (Characters -> [Char]) -> Characters -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Char) -> [Character] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map Character -> Char
toChar ([Character] -> [Char])
-> (Characters -> [Character]) -> Characters -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Characters -> [Character]
forall a. Unbox a => Vector a -> [a]
V.toList
data Sequence = Sequence
{ Sequence -> ByteString
name :: Name,
Sequence -> ByteString
description :: Description,
Sequence -> Alphabet
alphabet :: A.Alphabet,
Sequence -> Characters
characters :: Characters
}
deriving (Int -> Sequence -> ShowS
[Sequence] -> ShowS
Sequence -> [Char]
(Int -> Sequence -> ShowS)
-> (Sequence -> [Char]) -> ([Sequence] -> ShowS) -> Show Sequence
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Sequence] -> ShowS
$cshowList :: [Sequence] -> ShowS
show :: Sequence -> [Char]
$cshow :: Sequence -> [Char]
showsPrec :: Int -> Sequence -> ShowS
$cshowsPrec :: Int -> Sequence -> ShowS
Show, Sequence -> Sequence -> Bool
(Sequence -> Sequence -> Bool)
-> (Sequence -> Sequence -> Bool) -> Eq Sequence
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Sequence -> Sequence -> Bool
$c/= :: Sequence -> Sequence -> Bool
== :: Sequence -> Sequence -> Bool
$c== :: Sequence -> Sequence -> Bool
Eq)
alignRight :: Int -> BL.ByteString -> BL.ByteString
alignRight :: Int -> ByteString -> ByteString
alignRight Int
n ByteString
s =
Int64 -> Char -> ByteString
BL.replicate (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
l) Char
' ' ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) ByteString
s
where
l :: Int64
l = ByteString -> Int64
BL.length ByteString
s
alignLeft :: Int -> BL.ByteString -> BL.ByteString
alignLeft :: Int -> ByteString -> ByteString
alignLeft Int
n ByteString
s =
Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) ByteString
s ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Int64 -> Char -> ByteString
BL.replicate (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
l) Char
' '
where
l :: Int64
l = ByteString -> Int64
BL.length ByteString
s
getInfo :: Sequence -> BL.ByteString
getInfo :: Sequence -> ByteString
getInfo Sequence
s =
[ByteString] -> ByteString
BL.unwords
[ Int -> ByteString -> ByteString
alignLeft Int
nameWidth (Sequence -> ByteString
name Sequence
s),
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show (Alphabet -> [Char]) -> Alphabet -> [Char]
forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s),
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> (Int -> [Char]) -> Int -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Char]
forall a. Show a => a -> [Char]
show (Int -> ByteString) -> Int -> ByteString
forall a b. (a -> b) -> a -> b
$ Int
len),
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ [Char] -> Double -> [Char]
forall r. PrintfType r => [Char] -> r
P.printf [Char]
"%2.2f" Double
pGaps)
]
where
len :: Int
len = Sequence -> Int
length Sequence
s
nGaps :: Int
nGaps = Sequence -> Int
countGaps Sequence
s
pGaps :: Double
pGaps = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len :: Double
summarizeByteString :: Int -> BL.ByteString -> BL.ByteString
summarizeByteString :: Int -> ByteString -> ByteString
summarizeByteString Int
l ByteString
s
| ByteString -> Int64
BL.length ByteString
s Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l = Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l) ByteString
s ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> [Char] -> ByteString
BL.pack [Char]
"..."
| Bool
otherwise = ByteString
s
summarize :: Sequence -> BL.ByteString
summarize :: Sequence -> ByteString
summarize Sequence
s =
[ByteString] -> ByteString
BL.unwords
[Sequence -> ByteString
getInfo Sequence
s, Int -> ByteString -> ByteString
summarizeByteString Int
summaryLength (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ Characters -> ByteString
toByteString (Sequence -> Characters
characters Sequence
s)]
summarizeSequences :: [Sequence] -> BL.ByteString
summarizeSequences :: [Sequence] -> ByteString
summarizeSequences [Sequence]
ss = [Sequence] -> ByteString
header [Sequence]
ss ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> [Sequence] -> ByteString
body (Int -> [Sequence] -> [Sequence]
forall a. Int -> [a] -> [a]
take Int
summaryNSequences [Sequence]
ss)
tableHeader :: BL.ByteString
=
[ByteString] -> ByteString
BL.unwords
[ Int -> ByteString -> ByteString
alignLeft Int
nameWidth ByteString
"Name",
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Code",
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Length",
Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Gaps [%]",
ByteString
"Sequence"
]
header :: [Sequence] -> BL.ByteString
[Sequence]
ss =
[ByteString] -> ByteString
BL.unlines ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$
[ByteString]
reportIfSubsetIsShown
[ByteString] -> [ByteString] -> [ByteString]
forall a. [a] -> [a] -> [a]
++ [ [Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$
[Char]
"For each sequence, the "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
summaryLength
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" first bases are shown.",
[Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ [Char]
"List contains " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show ([Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss) [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" sequences.",
ByteString
"",
ByteString
tableHeader
]
where
l :: Int
l = [Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss
s :: [Char]
s =
Int -> [Char]
forall a. Show a => a -> [Char]
show Int
summaryNSequences
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" out of "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show ([Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss)
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" sequences are shown."
reportIfSubsetIsShown :: [ByteString]
reportIfSubsetIsShown
| Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
summaryNSequences = [[Char] -> ByteString
BL.pack [Char]
s]
| Bool
otherwise = []
body :: [Sequence] -> BL.ByteString
body :: [Sequence] -> ByteString
body [Sequence]
ss = [ByteString] -> ByteString
BL.unlines ((Sequence -> ByteString) -> [Sequence] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> ByteString
summarize [Sequence]
ss [ByteString] -> Strategy [ByteString] -> [ByteString]
forall a. a -> Strategy a -> a
`using` Int -> Strategy ByteString -> Strategy [ByteString]
forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
5 Strategy ByteString
forall a. NFData a => Strategy a
rdeepseq)
length :: Sequence -> Int
length :: Sequence -> Int
length = Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> (Sequence -> Int) -> Sequence -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int) -> (Sequence -> Characters) -> Sequence -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Sequence -> Characters
characters
equalLength :: [Sequence] -> Bool
equalLength :: [Sequence] -> Bool
equalLength = [Int] -> Bool
forall a. Eq a => [a] -> Bool
allEqual ([Int] -> Bool) -> ([Sequence] -> [Int]) -> [Sequence] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Sequence -> Int) -> [Sequence] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Int
length
where
allEqual :: [a] -> Bool
allEqual [] = Bool
True
allEqual [a]
xs = (a -> Bool) -> [a] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== [a] -> a
forall a. [a] -> a
head [a]
xs) ([a] -> Bool) -> [a] -> Bool
forall a b. (a -> b) -> a -> b
$ [a] -> [a]
forall a. [a] -> [a]
tail [a]
xs
longest :: [Sequence] -> Sequence
longest :: [Sequence] -> Sequence
longest = (Sequence -> Sequence -> Ordering) -> [Sequence] -> Sequence
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy ((Sequence -> Int) -> Sequence -> Sequence -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing Sequence -> Int
length)
countGaps :: Sequence -> Int
countGaps :: Sequence -> Int
countGaps Sequence
s = Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int)
-> (Characters -> Characters) -> Characters -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isGap (Alphabet -> Character -> Bool) -> Alphabet -> Character -> Bool
forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s) (Characters -> Int) -> Characters -> Int
forall a b. (a -> b) -> a -> b
$ Sequence -> Characters
characters Sequence
s
trim :: Int -> Sequence -> Sequence
trim :: Int -> Sequence -> Sequence
trim Int
n (Sequence ByteString
nm ByteString
d Alphabet
a Characters
cs) = ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
nm ByteString
d Alphabet
a (Int -> Characters -> Characters
forall a. Unbox a => Int -> Vector a -> Vector a
V.take (Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Characters
cs)
concat :: Sequence -> Sequence -> Sequence
concat :: Sequence -> Sequence -> Sequence
concat (Sequence ByteString
i ByteString
d Alphabet
c Characters
cs) (Sequence ByteString
j ByteString
f Alphabet
k Characters
ks)
| ByteString
i ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= ByteString
j =
[Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal names: "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
i
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
j
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| ByteString
d ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= ByteString
f =
[Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal descriptions: "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
d
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
f
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| Alphabet
c Alphabet -> Alphabet -> Bool
forall a. Eq a => a -> a -> Bool
/= Alphabet
k =
[Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal alphabets: "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show Alphabet
c
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show Alphabet
k
[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| Bool
otherwise =
ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
i ByteString
d Alphabet
c (Characters
cs Characters -> Characters -> Characters
forall a. Semigroup a => a -> a -> a
<> Characters
ks)
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = [Char] -> [Sequence]
forall a. HasCallStack => [Char] -> a
error [Char]
"concatenateSequences: Nothing to concatenate."
concatSequences [[Sequence]
ss] = [Sequence]
ss
concatSequences [[Sequence]]
sss = ([Sequence] -> [Sequence] -> [Sequence])
-> [[Sequence]] -> [Sequence]
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 ((Sequence -> Sequence -> Sequence)
-> [Sequence] -> [Sequence] -> [Sequence]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Sequence -> Sequence -> Sequence
concat) [[Sequence]]
sss
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan Int
n = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
n)
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan Int
n = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
n)
filterStandard :: [Sequence] -> [Sequence]
filterStandard :: [Sequence] -> [Sequence]
filterStandard = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
s -> Alphabet -> Sequence -> Bool
anyStandard (Sequence -> Alphabet
alphabet Sequence
s) Sequence
s)
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard :: Alphabet -> Sequence -> Bool
anyStandard Alphabet
a Sequence
s = (Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.any (Alphabet -> Character -> Bool
A.isStd Alphabet
a) Characters
cs where cs :: Characters
cs = Sequence -> Characters
characters Sequence
s