module Bio.Bam.Fastq ( parseFastq, parseFastqWith, parseFastqCassava ) where
import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude hiding ( isSpace )
import Bio.Streaming
import Bio.Streaming.Parse ( parse, atto )
import Data.Attoparsec.ByteString.Char8
( char, skipSpace, satisfy, inClass, skipWhile, takeTill
, scan, isSpace, isSpace_w8, (<?>) )
import qualified Data.Attoparsec.ByteString.Char8 as P
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
import qualified Data.Vector.Generic as V
import qualified Streaming.Prelude as Q
import qualified Bio.Streaming.Bytes as S
parseFastq :: Monad m => ByteStream m r -> Stream (Of BamRec) m (Either SomeException r)
parseFastq = parseFastqWith (const id)
parseFastqCassava :: Monad m => ByteStream m r -> Stream (Of BamRec) m (Either SomeException r)
parseFastqCassava = parseFastqWith (pdesc . C.split ':' . C.takeWhile (' ' /=))
where
pdesc (num:flg:_:idx:_) br = br { b_flag = sum [ if num == "1" then flagFirstMate .|. flagPaired else 0
, if num == "2" then flagSecondMate .|. flagPaired else 0
, if flg == "Y" then flagFailsQC else 0
, b_flag br .&. complement (flagFailsQC .|. flagSecondMate .|. flagPaired) ]
, b_exts = if C.all (`C.elem` "ACGTN") idx then insertE "XI" (Text idx) (b_exts br) else b_exts br }
pdesc _ br = br
parseFastqWith :: Monad m => (Bytes -> BamRec -> BamRec) -> ByteStream m r -> Stream (Of BamRec) m (Either SomeException r)
parseFastqWith descr = Q.unfoldr (liftM twiddle . parse (const $ atto pRec)) . skipJunk
where
twiddle (Left e) = Left (Left e)
twiddle (Right (Left r)) = Left (Right r)
twiddle (Right (Right a)) = Right a
isCBase = inClass "ACGTUBDHVSWMKRYNacgtubdhvswmkryn"
canSkip c = isSpace c || c == '.' || c == '-'
isHdr c = c == '@' || c == '>'
pRec = (satisfy isHdr <?> "start marker") *> (makeRecord <$> pName <*> (descr <$> P.takeWhile ('\n' /=)) <*> (pSeq >>= pQual))
pName = takeTill isSpace <* skipWhile (\c -> c /= '\n' && isSpace c) <?> "read name"
pSeq = (:) <$> satisfy isCBase <*> pSeq
<|> satisfy canSkip *> pSeq
<|> pure [] <?> "sequence"
pQual sq = (,) sq <$> (char '+' *> skipWhile ('\n' /=) *> pQual' (length sq) <* skipSpace <|> return C.empty) <?> "qualities"
pQual' n = B.filter (not . isSpace_w8) <$> scan n step
step 0 _ = Nothing
step i c | isSpace c = Just i
| otherwise = Just (i-1)
skipJunk :: Monad m => ByteStream m r -> ByteStream m r
skipJunk = lift . S.nextByte >=> check
where
check (Right (c,s)) | bad c = skipJunk . S.drop 1 . S.dropWhile (c2w '\n' /=) $ s
| otherwise = S.cons c s
check (Left r) = pure r
bad c = c /= c2w '>' && c /= c2w '@'
makeRecord :: Bytes -> (BamRec->BamRec) -> (String, Bytes) -> BamRec
makeRecord name extra (sq,qual) = extra $ nullBamRec
{ b_qname = name, b_seq = V.fromList $ read sq, b_qual = V.fromList $ map (Q . subtract 33) $ B.unpack qual }