-- | Parser for @FastA/FastQ@, 'Iteratee' style, based on -- "Data.Attoparsec", and written such that it is compatible with module -- 'Bio.Bam'. This gives import of @FastA/FastQ@ while respecting some -- local (to MPI EVAN) conventions. module Bio.Bam.Fastq ( parseFastq, parseFastq', parseFastqCassava ) where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Prelude hiding ( isSpace ) import Bio.Iteratee import Data.Attoparsec.ByteString.Char8 ( char, skipSpace, satisfy, inClass, skipWhile, takeTill , scan, isSpace, isSpace_w8, (<?>) ) import qualified Data.Attoparsec.ByteString.Char8 as P import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as S import qualified Data.Vector.Generic as V -- | Reader for DNA (not protein) sequences in FastA and FastQ. We read -- everything vaguely looking like FastA or FastQ, then shoehorn it into -- a BAM record. We strive to extract information following more or -- less established conventions from the header, but don't aim for -- completeness. The recognized syntactical warts are converted into -- appropriate flags and removed. Only the canonical variant of FastQ -- is supported (qualities stored as raw bytes with offset 33). -- -- Supported additional conventions: -- -- * A name suffix of @/1@ or @/2@ is turned into the first mate or second -- mate flag and the read is flagged as paired. -- -- * Same for name prefixes of @F_@ or @R_@, respectively. -- -- * A name prefix of @M_@ flags the sequence as unpaired and merged -- -- * A name prefix of @T_@ flags the sequence as unpaired and trimmed -- -- * A name prefix of @C_@, optionally before or after any of the other -- prefixes, is turned into the extra flag @XP:i:-1@ (result of -- duplicate removal with unknown duplicate count). -- -- * A collection of tags separated from the name by an octothorpe is -- removed and put into the fields @XI@ and @XJ@ as text. -- -- Everything before the first sequence header is ignored. Headers can -- start with @\>@ or @\@@, we treat both equally. The first word of -- the header becomes the read name, the remainder of the header is -- ignored. The sequence can be split across multiple lines; -- whitespace, dashes and dots are ignored, IUPAC-IUB ambiguity codes -- are accepted as bases, anything else causes an error. The sequence -- ends at a line that is either a header or starts with @\+@, in the -- latter case, that line is ignored and must be followed by quality -- scores. There must be exactly as many Q-scores as there are bases, -- followed immediately by a header or end-of-file. Whitespace is -- ignored. parseFastq :: Monad m => Enumeratee Bytes [ BamRec ] m a parseFastq = parseFastq' (const id) -- | Like 'parseFastq', but also -- -- * If the first word of the description has at least four colon -- separated subfields, the first is used to flag first/second mate, -- the second is the \"QC failed\" flag, and the fourth is the index -- sequence. parseFastqCassava :: Monad m => Enumeratee Bytes [ BamRec ] m a parseFastqCassava = parseFastq' (pdesc . S.split ':' . S.takeWhile (' ' /=)) where pdesc (num:flg:_:idx:_) br = br { b_flag = sum [ if num == "1" then flagFirstMate .|. flagPaired else 0 , if num == "2" then flagSecondMate .|. flagPaired else 0 , if flg == "Y" then flagFailsQC else 0 , b_flag br .&. complement (flagFailsQC .|. flagSecondMate .|. flagPaired) ] , b_exts = if S.all (`S.elem` "ACGTN") idx then insertE "XI" (Text idx) (b_exts br) else b_exts br } pdesc _ br = br -- | Same as 'parseFastq', but a custom function can be applied to the -- description string (the part of the header after the sequence name), -- which can modify the parsed record. Note that the quality field can -- end up empty. parseFastq' :: Monad m => ( Bytes -> BamRec -> BamRec ) -> Enumeratee Bytes [ BamRec ] m a parseFastq' descr it = do skipJunk ; convStream (parserToIteratee $ (:[]) <$> pRec) it where isCBase = inClass "ACGTUBDHVSWMKRYNacgtubdhvswmkryn" canSkip c = isSpace c || c == '.' || c == '-' isHdr c = c == '@' || c == '>' pRec = (satisfy isHdr <?> "start marker") *> (makeRecord <$> pName <*> (descr <$> P.takeWhile ('\n' /=)) <*> (pSeq >>= pQual)) pName = takeTill isSpace <* skipWhile (\c -> c /= '\n' && isSpace c) <?> "read name" pSeq = (:) <$> satisfy isCBase <*> pSeq <|> satisfy canSkip *> pSeq <|> pure [] <?> "sequence" pQual sq = (,) sq <$> (char '+' *> skipWhile ('\n' /=) *> pQual' (length sq) <* skipSpace <|> return S.empty) <?> "qualities" pQual' n = B.filter (not . isSpace_w8) <$> scan n step step 0 _ = Nothing step i c | isSpace c = Just i | otherwise = Just (i-1) skipJunk :: Monad m => Iteratee Bytes m () skipJunk = peekStreamBS >>= check where check (Just c) | bad c = dropWhileStreamBS (c2w '\n' /=) >> dropStreamBS 1 >> skipJunk check _ = return () bad c = c /= c2w '>' && c /= c2w '@' makeRecord :: Seqid -> (BamRec->BamRec) -> (String, Bytes) -> BamRec makeRecord name extra (sq,qual) = extra $ nullBamRec { b_qname = name, b_seq = V.fromList $ read sq, b_qual = V.fromList $ map (Q . subtract 33) $ B.unpack qual }