{-# LANGUAGE BangPatterns #-}

-- |
-- Module      :  ELynx.Import.Sequence.Fasta
-- Description :  Import Fasta sequences
-- Copyright   :  (c) Dominik Schrempf 2021
-- License     :  GPL-3.0-or-later
--
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Parse FASTA files.
--
-- [NCBI file specifications](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp).
module ELynx.Import.Sequence.Fasta
  ( fastaSequence,
    fasta,
  )
where

import Control.Applicative
import qualified Data.Attoparsec.ByteString as AS
import qualified Data.Attoparsec.ByteString.Char8 as AC
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import Data.Word8 (Word8)
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Sequence

isSpecial :: Char -> Bool
isSpecial :: Char -> Bool
isSpecial Char
w = Char
w Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'_', Char
'|', Char
'.', Char
'-']

isHeader :: Char -> Bool
isHeader :: Char -> Bool
isHeader Char
w = Char -> Bool
AC.isAlpha_ascii Char
w Bool -> Bool -> Bool
|| Char -> Bool
AC.isDigit Char
w Bool -> Bool -> Bool
|| Char -> Bool
isSpecial Char
w

sequenceHeader :: AS.Parser (BL.ByteString, BL.ByteString)
sequenceHeader :: Parser (ByteString, ByteString)
sequenceHeader = do
  Char
_ <- Char -> Parser Char
AC.char Char
'>'
  ByteString
n <- (Char -> Bool) -> Parser ByteString
AC.takeWhile1 Char -> Bool
isHeader
  ByteString
_ <- (Word8 -> Bool) -> Parser ByteString
AS.takeWhile Word8 -> Bool
AC.isHorizontalSpace
  ByteString
d <- (Char -> Bool) -> Parser ByteString
AC.takeWhile Char -> Bool
isHeader
  ()
_ <- Parser ()
AC.endOfLine
  (ByteString, ByteString) -> Parser (ByteString, ByteString)
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> ByteString
BL.fromStrict ByteString
n, ByteString -> ByteString
BL.fromStrict ByteString
d)

-- It is a little faster to directly pass the set of allowed characters. Then,
-- this set only has to be calculcated once per sequence in 'fastaSequence'.
sequenceLine :: S.Set Word8 -> AS.Parser BL.ByteString
sequenceLine :: Set Word8 -> Parser ByteString
sequenceLine Set Word8
s = do
  -- XXX: Will fail for non-capital letters.
  !ByteString
xs <- (Word8 -> Bool) -> Parser ByteString
AS.takeWhile1 (Word8 -> Set Word8 -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` Set Word8
s)
  ByteString -> Parser ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> ByteString
BL.fromStrict ByteString
xs)

-- XXX: If sequences are parsed line by line, the lines have to be copied when
-- forming the complete sequence. This is not memory efficient.

-- | Parse a sequence of characters.
fastaSequence :: Alphabet -> AS.Parser Sequence
fastaSequence :: Alphabet -> Parser Sequence
fastaSequence Alphabet
a = do
  (ByteString
n, ByteString
d) <- Parser (ByteString, ByteString)
sequenceHeader
  let !alph :: Set Word8
alph = (Character -> Word8) -> Set Character -> Set Word8
forall b a. Ord b => (a -> b) -> Set a -> Set b
S.map Character -> Word8
toWord (AlphabetSpec -> Set Character
A.all (AlphabetSpec -> Set Character)
-> (Alphabet -> AlphabetSpec) -> Alphabet -> Set Character
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alphabet -> AlphabetSpec
alphabetSpec (Alphabet -> Set Character) -> Alphabet -> Set Character
forall a b. (a -> b) -> a -> b
$ Alphabet
a)
  [ByteString]
lns <- Set Word8 -> Parser ByteString
sequenceLine Set Word8
alph Parser ByteString -> Parser () -> Parser ByteString [ByteString]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`AS.sepBy1` Parser ()
AC.endOfLine
  [()]
_ <- Parser () -> Parser ByteString [()]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser ()
AC.endOfLine
  Sequence -> Parser Sequence
forall (m :: * -> *) a. Monad m => a -> m a
return (Sequence -> Parser Sequence) -> Sequence -> Parser Sequence
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
n ByteString
d Alphabet
a (ByteString -> Characters
fromByteString (ByteString -> Characters) -> ByteString -> Characters
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
BL.concat [ByteString]
lns)

-- | Parse a Fasta file with given 'Alphabet'.
fasta :: Alphabet -> AS.Parser [Sequence]
fasta :: Alphabet -> Parser [Sequence]
fasta Alphabet
a = Parser Sequence -> Parser [Sequence]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
some (Alphabet -> Parser Sequence
fastaSequence Alphabet
a) Parser [Sequence] -> Parser () -> Parser [Sequence]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
forall t. Chunk t => Parser t ()
AS.endOfInput