{-# LANGUAGE OverloadedStrings #-}

-- |
-- Module      :  ELynx.Data.Sequence
-- Description :  Hereditary sequences
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Thu Oct  4 18:54:51 2018.
--
-- This module is to be imported qualified.
module ELynx.Data.Sequence.Sequence
  ( -- * Types
    Name,
    Description,
    Characters,
    Sequence (..),

    -- * Input
    fromByteString,

    -- * Output
    toByteString,
    header,
    summarize,
    summarizeSequences,
    body,

    -- * Analysis
    length,
    equalLength,
    longest,

    -- * Manipulation
    trim,
    concat,
    concatSequences,

    -- * Filtering
    filterShorterThan,
    filterLongerThan,
    filterStandard,
  )
where

import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Defaults
import qualified Text.Printf as P
import Prelude hiding
  ( concat,
    length,
  )
import qualified Prelude as Pr
  ( length,
  )

-- | For now, 'Name's are just 'BL.ByteString's.
type Name = BL.ByteString

-- | The description of a sequence.
type Description = BL.ByteString

-- | The vector of characters of a sequence.
type Characters = V.Vector Character

-- | Convert byte string to sequence characters.
fromByteString :: BL.ByteString -> Characters
fromByteString :: ByteString -> Characters
fromByteString = [Character] -> Characters
forall a. Unbox a => [a] -> Vector a
V.fromList ([Character] -> Characters)
-> (ByteString -> [Character]) -> ByteString -> Characters
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Character) -> [Char] -> [Character]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Character
fromChar ([Char] -> [Character])
-> (ByteString -> [Char]) -> ByteString -> [Character]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Char]
BL.unpack

-- | Convert sequence characters to byte string.
toByteString :: Characters -> BL.ByteString
toByteString :: Characters -> ByteString
toByteString = [Char] -> ByteString
BL.pack ([Char] -> ByteString)
-> (Characters -> [Char]) -> Characters -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Char) -> [Character] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map Character -> Char
toChar ([Character] -> [Char])
-> (Characters -> [Character]) -> Characters -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Characters -> [Character]
forall a. Unbox a => Vector a -> [a]
V.toList

-- | Sequences have a name, a possibly empty description, a code and hopefully a
-- lot of data.
data Sequence = Sequence
  { Sequence -> ByteString
name :: Name,
    Sequence -> ByteString
description :: Description,
    Sequence -> Alphabet
alphabet :: A.Alphabet,
    Sequence -> Characters
characters :: Characters
  }
  deriving (Int -> Sequence -> ShowS
[Sequence] -> ShowS
Sequence -> [Char]
(Int -> Sequence -> ShowS)
-> (Sequence -> [Char]) -> ([Sequence] -> ShowS) -> Show Sequence
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Sequence] -> ShowS
$cshowList :: [Sequence] -> ShowS
show :: Sequence -> [Char]
$cshow :: Sequence -> [Char]
showsPrec :: Int -> Sequence -> ShowS
$cshowsPrec :: Int -> Sequence -> ShowS
Show, Sequence -> Sequence -> Bool
(Sequence -> Sequence -> Bool)
-> (Sequence -> Sequence -> Bool) -> Eq Sequence
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Sequence -> Sequence -> Bool
$c/= :: Sequence -> Sequence -> Bool
== :: Sequence -> Sequence -> Bool
$c== :: Sequence -> Sequence -> Bool
Eq)

alignRight :: Int -> BL.ByteString -> BL.ByteString
alignRight :: Int -> ByteString -> ByteString
alignRight Int
n ByteString
s =
  Int64 -> Char -> ByteString
BL.replicate (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
l) Char
' ' ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) ByteString
s
  where
    l :: Int64
l = ByteString -> Int64
BL.length ByteString
s

alignLeft :: Int -> BL.ByteString -> BL.ByteString
alignLeft :: Int -> ByteString -> ByteString
alignLeft Int
n ByteString
s =
  Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) ByteString
s ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> Int64 -> Char -> ByteString
BL.replicate (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
l) Char
' '
  where
    l :: Int64
l = ByteString -> Int64
BL.length ByteString
s

getInfo :: Sequence -> BL.ByteString
getInfo :: Sequence -> ByteString
getInfo Sequence
s =
  [ByteString] -> ByteString
BL.unwords
    [ Int -> ByteString -> ByteString
alignLeft Int
nameWidth (Sequence -> ByteString
name Sequence
s),
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show (Alphabet -> [Char]) -> Alphabet -> [Char]
forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s),
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> (Int -> [Char]) -> Int -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Char]
forall a. Show a => a -> [Char]
show (Int -> ByteString) -> Int -> ByteString
forall a b. (a -> b) -> a -> b
$ Int
len),
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ([Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ [Char] -> Double -> [Char]
forall r. PrintfType r => [Char] -> r
P.printf [Char]
"%2.2f" Double
pGaps)
    ]
  where
    len :: Int
len = Sequence -> Int
length Sequence
s
    nGaps :: Int
nGaps = Sequence -> Int
countGaps Sequence
s
    pGaps :: Double
pGaps = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len :: Double

-- If a string is longer than a given value, trim it and add some dots.
summarizeByteString :: Int -> BL.ByteString -> BL.ByteString
summarizeByteString :: Int -> ByteString -> ByteString
summarizeByteString Int
l ByteString
s
  | ByteString -> Int64
BL.length ByteString
s Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l = Int64 -> ByteString -> ByteString
BL.take (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l) ByteString
s ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> [Char] -> ByteString
BL.pack [Char]
"..."
  | Bool
otherwise = ByteString
s

-- | Trim and show a 'Sequence'.
summarize :: Sequence -> BL.ByteString
summarize :: Sequence -> ByteString
summarize Sequence
s =
  [ByteString] -> ByteString
BL.unwords
    [Sequence -> ByteString
getInfo Sequence
s, Int -> ByteString -> ByteString
summarizeByteString Int
summaryLength (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ Characters -> ByteString
toByteString (Sequence -> Characters
characters Sequence
s)]

-- | Trim and show a list of 'Sequence's.
summarizeSequences :: [Sequence] -> BL.ByteString
summarizeSequences :: [Sequence] -> ByteString
summarizeSequences [Sequence]
ss = [Sequence] -> ByteString
header [Sequence]
ss ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> [Sequence] -> ByteString
body (Int -> [Sequence] -> [Sequence]
forall a. Int -> [a] -> [a]
take Int
summaryNSequences [Sequence]
ss)

-- | Header printed before 'Sequence' list.
tableHeader :: BL.ByteString
tableHeader :: ByteString
tableHeader =
  [ByteString] -> ByteString
BL.unwords
    [ Int -> ByteString -> ByteString
alignLeft Int
nameWidth ByteString
"Name",
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Code",
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Length",
      Int -> ByteString -> ByteString
alignRight Int
fieldWidth ByteString
"Gaps [%]",
      ByteString
"Sequence"
    ]

-- | A short description of the sequence.
header :: [Sequence] -> BL.ByteString
header :: [Sequence] -> ByteString
header [Sequence]
ss =
  [ByteString] -> ByteString
BL.unlines ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$
    [ByteString]
reportIfSubsetIsShown
      [ByteString] -> [ByteString] -> [ByteString]
forall a. [a] -> [a] -> [a]
++ [ [Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$
             [Char]
"For each sequence, the "
               [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
summaryLength
               [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" first bases are shown.",
           [Char] -> ByteString
BL.pack ([Char] -> ByteString) -> [Char] -> ByteString
forall a b. (a -> b) -> a -> b
$ [Char]
"List contains " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show ([Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss) [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" sequences.",
           ByteString
"",
           ByteString
tableHeader
         ]
  where
    l :: Int
l = [Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss
    s :: [Char]
s =
      Int -> [Char]
forall a. Show a => a -> [Char]
show Int
summaryNSequences
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" out of "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show ([Sequence] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss)
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" sequences are shown."
    reportIfSubsetIsShown :: [ByteString]
reportIfSubsetIsShown
      | Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
summaryNSequences = [[Char] -> ByteString
BL.pack [Char]
s]
      | Bool
otherwise = []

-- | Trim and show a list of 'Sequence's.
body :: [Sequence] -> BL.ByteString
body :: [Sequence] -> ByteString
body [Sequence]
ss = [ByteString] -> ByteString
BL.unlines ((Sequence -> ByteString) -> [Sequence] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> ByteString
summarize [Sequence]
ss [ByteString] -> Strategy [ByteString] -> [ByteString]
forall a. a -> Strategy a -> a
`using` Int -> Strategy ByteString -> Strategy [ByteString]
forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
5 Strategy ByteString
forall a. NFData a => Strategy a
rdeepseq)

-- | Calculate length of 'Sequence'.
length :: Sequence -> Int
length :: Sequence -> Int
length = Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> (Sequence -> Int) -> Sequence -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int) -> (Sequence -> Characters) -> Sequence -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Sequence -> Characters
characters

-- | Check if all 'Sequence's have equal length.
equalLength :: [Sequence] -> Bool
equalLength :: [Sequence] -> Bool
equalLength = [Int] -> Bool
forall a. Eq a => [a] -> Bool
allEqual ([Int] -> Bool) -> ([Sequence] -> [Int]) -> [Sequence] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Sequence -> Int) -> [Sequence] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Int
length
  where
    allEqual :: [a] -> Bool
allEqual [] = Bool
True
    allEqual [a]
xs = (a -> Bool) -> [a] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== [a] -> a
forall a. [a] -> a
head [a]
xs) ([a] -> Bool) -> [a] -> Bool
forall a b. (a -> b) -> a -> b
$ [a] -> [a]
forall a. [a] -> [a]
tail [a]
xs

-- | Find the longest 'Sequence' in a list.
longest :: [Sequence] -> Sequence
longest :: [Sequence] -> Sequence
longest = (Sequence -> Sequence -> Ordering) -> [Sequence] -> Sequence
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy ((Sequence -> Int) -> Sequence -> Sequence -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing Sequence -> Int
length)

-- | Count number of gaps or unknown characters in sequence.
countGaps :: Sequence -> Int
countGaps :: Sequence -> Int
countGaps Sequence
s = Characters -> Int
forall a. Unbox a => Vector a -> Int
V.length (Characters -> Int)
-> (Characters -> Characters) -> Characters -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Character -> Bool) -> Characters -> Characters
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isGap (Alphabet -> Character -> Bool) -> Alphabet -> Character -> Bool
forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s) (Characters -> Int) -> Characters -> Int
forall a b. (a -> b) -> a -> b
$ Sequence -> Characters
characters Sequence
s

-- | Trim to given length.
trim :: Int -> Sequence -> Sequence
trim :: Int -> Sequence -> Sequence
trim Int
n (Sequence ByteString
nm ByteString
d Alphabet
a Characters
cs) = ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
nm ByteString
d Alphabet
a (Int -> Characters -> Characters
forall a. Unbox a => Int -> Vector a -> Vector a
V.take (Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Characters
cs)

-- | Concatenate two sequences. 'Name's have to match.
concat :: Sequence -> Sequence -> Sequence
concat :: Sequence -> Sequence -> Sequence
concat (Sequence ByteString
i ByteString
d Alphabet
c Characters
cs) (Sequence ByteString
j ByteString
f Alphabet
k Characters
ks)
  | ByteString
i ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= ByteString
j =
    [Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
      [Char]
"concatenate: Sequences do not have equal names: "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
i
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
j
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | ByteString
d ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= ByteString
f =
    [Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
      [Char]
"concatenate: Sequences do not have equal descriptions: "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
d
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ ByteString -> [Char]
BL.unpack ByteString
f
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | Alphabet
c Alphabet -> Alphabet -> Bool
forall a. Eq a => a -> a -> Bool
/= Alphabet
k =
    [Char] -> Sequence
forall a. HasCallStack => [Char] -> a
error ([Char] -> Sequence) -> [Char] -> Sequence
forall a b. (a -> b) -> a -> b
$
      [Char]
"concatenate: Sequences do not have equal alphabets: "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show Alphabet
c
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
", "
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Alphabet -> [Char]
forall a. Show a => a -> [Char]
show Alphabet
k
        [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | Bool
otherwise =
    ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
i ByteString
d Alphabet
c (Characters
cs Characters -> Characters -> Characters
forall a. Semigroup a => a -> a -> a
<> Characters
ks)

-- | Concatenate a list of sequences, see 'concat'.
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = [Char] -> [Sequence]
forall a. HasCallStack => [Char] -> a
error [Char]
"concatenateSequences: Nothing to concatenate."
concatSequences [[Sequence]
ss] = [Sequence]
ss
concatSequences [[Sequence]]
sss = ([Sequence] -> [Sequence] -> [Sequence])
-> [[Sequence]] -> [Sequence]
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 ((Sequence -> Sequence -> Sequence)
-> [Sequence] -> [Sequence] -> [Sequence]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Sequence -> Sequence -> Sequence
concat) [[Sequence]]
sss

-- | Only take 'Sequence's that are shorter than a given number.
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan Int
n = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
n)

-- | Only take 'Sequence's that are longer than a given number.
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan Int
n = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
n)

-- | Only take 'Sequence's that contain at least on non-IUPAC character.
filterStandard :: [Sequence] -> [Sequence]
filterStandard :: [Sequence] -> [Sequence]
filterStandard = (Sequence -> Bool) -> [Sequence] -> [Sequence]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
s -> Alphabet -> Sequence -> Bool
anyStandard (Sequence -> Alphabet
alphabet Sequence
s) Sequence
s)

-- Are all characters IUPAC characters?
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard :: Alphabet -> Sequence -> Bool
anyStandard Alphabet
a Sequence
s = (Character -> Bool) -> Characters -> Bool
forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.any (Alphabet -> Character -> Bool
A.isStd Alphabet
a) Characters
cs where cs :: Characters
cs = Sequence -> Characters
characters Sequence
s