{-# LANGUAGE BangPatterns #-}
module ELynx.Import.Sequence.Fasta
( fastaSequence,
fasta,
)
where
import Control.Applicative
import qualified Data.Attoparsec.ByteString as AS
import qualified Data.Attoparsec.ByteString.Char8 as AC
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import Data.Word8 (Word8)
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Sequence
isSpecial :: Char -> Bool
isSpecial :: Char -> Bool
isSpecial Char
w = Char
w Char -> [Char] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'_', Char
'|', Char
'.', Char
'-']
isHeader :: Char -> Bool
Char
w = Char -> Bool
AC.isAlpha_ascii Char
w Bool -> Bool -> Bool
|| Char -> Bool
AC.isDigit Char
w Bool -> Bool -> Bool
|| Char -> Bool
isSpecial Char
w
sequenceHeader :: AS.Parser (BL.ByteString, BL.ByteString)
= do
Char
_ <- Char -> Parser Char
AC.char Char
'>'
ByteString
n <- (Char -> Bool) -> Parser ByteString
AC.takeWhile1 Char -> Bool
isHeader
ByteString
_ <- (Word8 -> Bool) -> Parser ByteString
AS.takeWhile Word8 -> Bool
AC.isHorizontalSpace
ByteString
d <- (Char -> Bool) -> Parser ByteString
AC.takeWhile Char -> Bool
isHeader
()
_ <- Parser ()
AC.endOfLine
(ByteString, ByteString) -> Parser (ByteString, ByteString)
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> ByteString
BL.fromStrict ByteString
n, ByteString -> ByteString
BL.fromStrict ByteString
d)
sequenceLine :: S.Set Word8 -> AS.Parser BL.ByteString
sequenceLine :: Set Word8 -> Parser ByteString
sequenceLine Set Word8
s = do
!ByteString
xs <- (Word8 -> Bool) -> Parser ByteString
AS.takeWhile1 (Word8 -> Set Word8 -> Bool
forall a. Ord a => a -> Set a -> Bool
`S.member` Set Word8
s)
ByteString -> Parser ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> ByteString
BL.fromStrict ByteString
xs)
fastaSequence :: Alphabet -> AS.Parser Sequence
fastaSequence :: Alphabet -> Parser Sequence
fastaSequence Alphabet
a = do
(ByteString
n, ByteString
d) <- Parser (ByteString, ByteString)
sequenceHeader
let !alph :: Set Word8
alph = (Character -> Word8) -> Set Character -> Set Word8
forall b a. Ord b => (a -> b) -> Set a -> Set b
S.map Character -> Word8
toWord (AlphabetSpec -> Set Character
A.all (AlphabetSpec -> Set Character)
-> (Alphabet -> AlphabetSpec) -> Alphabet -> Set Character
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alphabet -> AlphabetSpec
alphabetSpec (Alphabet -> Set Character) -> Alphabet -> Set Character
forall a b. (a -> b) -> a -> b
$ Alphabet
a)
[ByteString]
lns <- Set Word8 -> Parser ByteString
sequenceLine Set Word8
alph Parser ByteString -> Parser () -> Parser ByteString [ByteString]
forall (f :: * -> *) a s. Alternative f => f a -> f s -> f [a]
`AS.sepBy1` Parser ()
AC.endOfLine
[()]
_ <- Parser () -> Parser ByteString [()]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser ()
AC.endOfLine
Sequence -> Parser Sequence
forall (m :: * -> *) a. Monad m => a -> m a
return (Sequence -> Parser Sequence) -> Sequence -> Parser Sequence
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString -> Alphabet -> Characters -> Sequence
Sequence ByteString
n ByteString
d Alphabet
a (ByteString -> Characters
fromByteString (ByteString -> Characters) -> ByteString -> Characters
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
BL.concat [ByteString]
lns)
fasta :: Alphabet -> AS.Parser [Sequence]
fasta :: Alphabet -> Parser [Sequence]
fasta Alphabet
a = Parser Sequence -> Parser [Sequence]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
some (Alphabet -> Parser Sequence
fastaSequence Alphabet
a) Parser [Sequence] -> Parser () -> Parser [Sequence]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
forall t. Chunk t => Parser t ()
AS.endOfInput