-- | Parser for @FastA/FastQ@, 'ByteStream' style, written such that it -- works well with module "Bio.Bam". -- -- Input streams are broken into numbered lines, then into records. -- Records can start with empty lines, which are ignored, or random -- junk, which is ignored, but results in a warning, followed by a -- header indicating either a @FastA@ (begins with @\>@ or @;@) or -- @FastQ@ record (begins with @\@@). More description lines begining -- with @;@ are allowed, and silently ignored. All following lines not -- starting with @+@, @\>@, @;@ or @\@@ are sequence lines. (Only) in a -- @FastQ@ record, this is followed by a separator line starting with a -- @+@, which is ignored, and exactly as many quality lines as there -- were sequence lines. A missing separator results in a warning and -- the record being parsed without quality scores. -- -- In sequence lines, IUPAC-IUB ambiguity codes are converted to -- 'Nucleotides', white space is skipped silently. Any other character -- becomes an unknown base ('=' in SAM) and a warning is emitted. Note -- that downstream tools are unlikely to handle the resulting unknown -- bases and/or empty records gracefully. If the quality lines do not -- have the same total length as the sequence lines (this includes -- missing quality lines due to end-of-stream), a warning is emitted, -- and the record receives no quality scores (just as if it was a -- @FastA@ record). Else, if the quality lines have a different layout -- than the sequence lines, a warning is emitted, but they are still -- used. -- -- Quality scores must be stored as raw bytes with offset 33. (Other -- variants, like 454's ASCII qualities and Solexa's raw bytes with -- offset 64 are difficult to detect, and extinct in the wild anyway.) -- If the second word of the header stores multiple fields, we try to -- extract Illumina's \"QC failed\" flag and either an index sequence or -- a read group name from it. -- -- Other flags are commonly encoded into the sequence names. We do not -- handle those here, but most of the conventions at MPI EVAN are dealt -- with by 'Bio.Bam.Evan.removeWarts'. module Bio.Bam.Fastq ( parseFastq , EmptyRecord(..) , IncoherentQualities(..) , IncongruentQualities(..) , JunkFound(..) , QualitiesMissing(..) , SequenceHasGaps(..) ) where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Prelude import Bio.Streaming import Bio.Streaming.Bytes ( lines' ) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as C import qualified Data.ByteString.Unsafe as B import qualified Data.Vector.Storable as W import qualified Streaming.Prelude as Q -- | Emitted when random text is found instead of a header. data JunkFound = JunkFound !Int !Bytes deriving (Typeable, Show) instance Exception JunkFound where displayException (JunkFound n s) = printf "junk found at line %d: %s" n (unpack s) -- | Emitted when a quality separator was expected, but not found. data QualitiesMissing = QualitiesMissing !Int !Bytes deriving (Typeable, Show) instance Exception QualitiesMissing where displayException (QualitiesMissing 0 _) = printf "expected '+' symbol at end of file" displayException (QualitiesMissing n s) = printf "expected '+' symbol at line %d, but found %s" n (unpack s) -- | Emitted when a quality record does not fit the sequence record. data IncoherentQualities = IncoherentQualities !Int !Bytes deriving (Typeable, Show) instance Exception IncoherentQualities where displayException (IncoherentQualities n s) = printf "quality record of incorrect length ignored at line %d (%s)" n (unpack s) -- | Emitted when a quality record has different layout than the -- sequence. data IncongruentQualities = IncongruentQualities !Int !Bytes deriving (Typeable, Show) instance Exception IncongruentQualities where displayException (IncongruentQualities n s) = printf "quality and sequence have different layouts at line %d (%s)" n (unpack s) -- | Emitted when a sequence record contains strange characters data SequenceHasGaps = SequenceHasGaps !Int !Bytes deriving (Typeable, Show) instance Exception SequenceHasGaps where displayException (SequenceHasGaps n cs) = printf "undefined characters %s stored as unknown bases at line %d" (show cs) n data EmptyRecord = EmptyRecord !Int !Bytes deriving (Typeable, Show) instance Exception EmptyRecord where displayException (EmptyRecord n s) = printf "(effectively) empty record at line %d (%s)" n (unpack s) {-# INLINE parseFastq #-} parseFastq :: MonadLog m => ByteStream m r -> Stream (Of BamRec) m r parseFastq = Q.unfoldr go . Q.zip (Q.enumFrom (1::Int)) . lines' where go = inspect >=> \case Left r -> return (Left r) Right ((i,h) :> ls) | B.null h -> go ls | C.head h == '>' || C.head h == ';' -> goFasta (i, B.tail h) (Q.dropWhile isDescr ls) | C.head h == '@' -> goFastq (i, B.tail h) (Q.dropWhile isDescr ls) | otherwise -> logMsg Warning (JunkFound i h) >> go ls isDescr (_,s) = not (B.null s) && C.head s == ';' isHeader (_,s) = not (B.null s) && (C.head s == '>' || C.head s == '@' || C.head s == ';') isSep (_,s) = not (B.null s) && C.head s == '+' isSeq x = not (isHeader x || isSep x) goFasta h ls = do sq :> ls' <- Q.toList $ Q.span isSeq ls make_record h sq Nothing ls' goFastq h ls = do sq :> ls1 <- Q.toList $ Q.span isSeq ls Q.next ls1 >>= \case Right (sep, ls2) | isSep sep -> do qs :> ls3 <- Q.toList $ Q.splitAt (length sq) ls2 if sum (map (B.length . snd) qs) /= sum (map (B.length . snd) sq) then do logMsg Error $ uncurry IncoherentQualities h make_record h sq Nothing ls3 else do when (map (B.length . snd) qs /= map (B.length . snd) sq) $ logMsg Warning $ uncurry IncongruentQualities h make_record h sq (Just qs) ls3 | otherwise -> do logMsg Error $ uncurry QualitiesMissing sep make_record h sq Nothing (Q.cons sep ls2) Left x -> do logMsg Error $ QualitiesMissing 0 C.empty make_record h sq Nothing (pure x) make_record h sq qs k = do when (ngaps > 0) . logMsg Warning $ SequenceHasGaps (fst $ head sq) (B.filter (isGap . toNucleotides) $ B.concat $ map snd sq) when (l == 0) . logMsg Warning $ uncurry EmptyRecord h return $ Right (r,k) where !l = sum $ map (B.length . snd) sq (!nseq, !ngaps) = mkSeq l sq !qual = case qs of Nothing -> Nothing Just [] -> Nothing Just q -> Just $! mkQual l q (!qname, !descr) = B.break (== 32) (snd h) !fflag = B.drop 1 . C.dropWhile (/= ':') $ descr !r = if B.length fflag < 2 || C.index fflag 1 /= ':' || (C.head fflag /= 'Y' && C.head fflag /= 'N') then nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual } else let !flag | C.head fflag /= 'Y' = b_flag nullBamRec .|. flagFailsQC | otherwise = b_flag nullBamRec !sample = B.takeWhile (/=32). B.drop 1. C.dropWhile (/=':'). B.drop 1. C.dropWhile (/=':') $ fflag !exts | B.null sample = [ ] | C.all (`C.elem` "ACGTN") sample = [("XI", Text sample)] | otherwise = [("RG", Text sample)] in nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual, b_flag = flag, b_exts = exts } mkSeq :: Int -> [(Int,Bytes)] -> (Vector_Nucs_half Nucleotides, Int) mkSeq ltot xs0 = unsafePerformIO $ do fp <- mallocForeignPtrBytes (shiftR (ltot+1) 1) g <- withForeignPtr fp $ \p -> go_even p 0 xs0 return ( Vector_Nucs_half 0 ltot fp, g ) where go_even !p !g (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_even ss p q g l go_even _ !g [ ] = return g go_odd !p !g !a (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_odd a ss p q g l go_odd !p !g !a [ ] = poke p a >> return g go1_odd a ss !p !q !g !l | l > 0 = do !b <- unNs . toNucleotides <$> peekByteOff q 0 poke p $ a `shiftL` 4 .|. b go1_even ss (plusPtr p 1) (plusPtr q 1) (g + fromEnum (b == 0)) (l-1) | otherwise = go_odd p g a ss go1_even ss !p !q !g !l | l > 1 = do !a <- unNs . toNucleotides <$> peekByteOff q 0 !b <- unNs . toNucleotides <$> peekByteOff q 1 poke p $ a `shiftL` 4 .|. b go1_even ss (plusPtr p 1) (plusPtr q 2) (g + fromEnum (a == 0) + fromEnum (b == 0)) (l-2) | l > 0 = do !a <- unNs . toNucleotides <$> peekByteOff q 0 go_odd p (g + fromEnum (a == 0)) (a `shiftL` 4) ss | otherwise = go_even p g ss mkQual :: Int -> [(Int, Bytes)] -> W.Vector Qual mkQual ltot qs0 = unsafePerformIO $ do fp <- mallocForeignPtrBytes ltot withForeignPtr fp $ \p -> go (castPtr p) qs0 return $! W.unsafeFromForeignPtr0 fp ltot where go !p (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1 ss p (castPtr q) l go _ [ ] = return () go1 ss !p !q !l | l > 0 = do peek q >>= poke p . subtract (33::Word8) go1 ss (plusPtr p 1) (plusPtr q 1) (pred l) | otherwise = go p ss