{-# LANGUAGE BangPatterns #-}
module ELynx.Import.Sequence.Fasta
( fastaSequence,
fasta,
)
where
import Control.Applicative
import qualified Data.Attoparsec.ByteString as AS
import qualified Data.Attoparsec.ByteString.Char8 as AC
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Set as S
import Data.Word8 (Word8)
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Sequence
-- | Test whether a character is one of the four punctuation characters
-- underscore, pipe, dot, or hyphen.
isSpecial :: Char -> Bool
isSpecial c = case c of
  '_' -> True
  '|' -> True
  '.' -> True
  '-' -> True
  _ -> False
-- | Test whether a character is allowed in a header field: an ASCII letter
-- (per 'AC.isAlpha_ascii'), a digit (per 'AC.isDigit'), or a character
-- accepted by 'isSpecial'.
isHeader :: Char -> Bool
isHeader c
  | AC.isAlpha_ascii c = True
  | AC.isDigit c = True
  | otherwise = isSpecial c
-- | Parse a header line.
--
-- The line consists of a @>@, a non-empty name, optional horizontal
-- whitespace, a possibly empty description, and a line ending. Name and
-- description are both made up solely of characters satisfying 'isHeader';
-- consequently, a description containing whitespace is not accepted.
--
-- Returns the pair @(name, description)@ as lazy byte strings.
sequenceHeader :: AS.Parser (BL.ByteString, BL.ByteString)
sequenceHeader = do
  name <- AC.char '>' *> AC.takeWhile1 isHeader
  description <-
    AS.takeWhile AC.isHorizontalSpace
      *> AC.takeWhile isHeader
      <* AC.endOfLine
  pure (BL.fromStrict name, BL.fromStrict description)
-- | Parse one non-empty run of bytes that are all members of the given set,
-- and return it as a lazy byte string. The line ending is not consumed.
sequenceLine :: S.Set Word8 -> AS.Parser BL.ByteString
sequenceLine allowed = do
  -- The bang forces the strict chunk as soon as it has been parsed.
  !chunk <- AS.takeWhile1 isAllowed
  pure $ BL.fromStrict chunk
  where
    isAllowed w = S.member w allowed
-- | Parse a single FASTA record for the given alphabet.
--
-- A record is a header line (see 'sequenceHeader'), followed by one or more
-- sequence lines separated by line endings, followed by any number of
-- trailing line endings. A sequence line may only contain bytes from the set
-- derived from the alphabet; any other byte ends the record.
--
-- The sequence lines are concatenated, converted with 'fromByteString', and
-- combined with the header fields and the alphabet into a 'Sequence'.
fastaSequence :: Alphabet -> AS.Parser Sequence
fastaSequence a = do
  (name, description) <- sequenceHeader
  chunks <- sequenceLine allowed `AS.sepBy1` AC.endOfLine
  _ <- many AC.endOfLine
  return $ Sequence name description a (fromByteString $ BL.concat chunks)
  where
    -- Set of bytes accepted on sequence lines (presumably every character of
    -- the alphabet; confirm against 'A.all' and 'alphabetSpec'). It depends
    -- only on the alphabet, so it is bound outside the monadic continuation:
    -- this lets it be shared when the same parser value is run repeatedly
    -- (as 'some' does in 'fasta') instead of being rebuilt for every record.
    allowed = S.map toWord (A.all . alphabetSpec $ a)
-- | Parse one or more FASTA records (see 'fastaSequence') for the given
-- alphabet. The parser fails unless the records are followed directly by the
-- end of the input.
fasta :: Alphabet -> AS.Parser [Sequence]
fasta a = do
  records <- some (fastaSequence a)
  AS.endOfInput
  pure records