module Bio.Bam.Fastq
( parseFastq
, EmptyRecord(..)
, IncoherentQualities(..)
, IncongruentQualities(..)
, JunkFound(..)
, QualitiesMissing(..)
, SequenceHasGaps(..)
) where
import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude
import Bio.Streaming
import Bio.Streaming.Bytes ( lines' )
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Unsafe as B
import qualified Data.Vector.Storable as W
import qualified Streaming.Prelude as Q
data JunkFound = JunkFound !Int !Bytes deriving (Typeable, Show)
instance Exception JunkFound where
displayException (JunkFound n s) = printf "junk found at line %d: %s" n (unpack s)
data QualitiesMissing = QualitiesMissing !Int !Bytes deriving (Typeable, Show)
instance Exception QualitiesMissing where
displayException (QualitiesMissing 0 _) = printf "expected '+' symbol at end of file"
displayException (QualitiesMissing n s) = printf "expected '+' symbol at line %d, but found %s" n (unpack s)
data IncoherentQualities = IncoherentQualities !Int !Bytes deriving (Typeable, Show)
instance Exception IncoherentQualities where
displayException (IncoherentQualities n s) = printf "quality record of incorrect length ignored at line %d (%s)" n (unpack s)
data IncongruentQualities = IncongruentQualities !Int !Bytes deriving (Typeable, Show)
instance Exception IncongruentQualities where
displayException (IncongruentQualities n s) = printf "quality and sequence have different layouts at line %d (%s)" n (unpack s)
data SequenceHasGaps = SequenceHasGaps !Int !Bytes deriving (Typeable, Show)
instance Exception SequenceHasGaps where
displayException (SequenceHasGaps n cs) = printf "undefined characters %s stored as unknown bases at line %d" (show cs) n
data EmptyRecord = EmptyRecord !Int !Bytes deriving (Typeable, Show)
instance Exception EmptyRecord where
displayException (EmptyRecord n s) = printf "(effectively) empty record at line %d (%s)" n (unpack s)
{-# INLINE parseFastq #-}
parseFastq :: MonadLog m => ByteStream m r -> Stream (Of BamRec) m r
parseFastq = Q.unfoldr go . Q.zip (Q.enumFrom (1::Int)) . lines'
where
go = inspect >=> \case
Left r -> return (Left r)
Right ((i,h) :> ls)
| B.null h -> go ls
| C.head h == '>' || C.head h == ';' -> goFasta (i, B.tail h) (Q.dropWhile isDescr ls)
| C.head h == '@' -> goFastq (i, B.tail h) (Q.dropWhile isDescr ls)
| otherwise -> logMsg Warning (JunkFound i h) >> go ls
isDescr (_,s) = not (B.null s) && C.head s == ';'
isHeader (_,s) = not (B.null s) && (C.head s == '>' || C.head s == '@' || C.head s == ';')
isSep (_,s) = not (B.null s) && C.head s == '+'
isSeq x = not (isHeader x || isSep x)
goFasta h ls = do
sq :> ls' <- Q.toList $ Q.span isSeq ls
make_record h sq Nothing ls'
goFastq h ls = do
sq :> ls1 <- Q.toList $ Q.span isSeq ls
Q.next ls1 >>= \case
Right (sep, ls2)
| isSep sep -> do
qs :> ls3 <- Q.toList $ Q.splitAt (length sq) ls2
if sum (map (B.length . snd) qs) /= sum (map (B.length . snd) sq) then do
logMsg Error $ uncurry IncoherentQualities h
make_record h sq Nothing ls3
else do
when (map (B.length . snd) qs /= map (B.length . snd) sq) $
logMsg Warning $ uncurry IncongruentQualities h
make_record h sq (Just qs) ls3
| otherwise -> do
logMsg Error $ uncurry QualitiesMissing sep
make_record h sq Nothing (Q.cons sep ls2)
Left x -> do
logMsg Error $ QualitiesMissing 0 C.empty
make_record h sq Nothing (pure x)
make_record h sq qs k = do
when (ngaps > 0) . logMsg Warning $
SequenceHasGaps (fst $ head sq) (B.filter (isGap . toNucleotides) $ B.concat $ map snd sq)
when (l == 0) . logMsg Warning $ uncurry EmptyRecord h
return $ Right (r,k)
where
!l = sum $ map (B.length . snd) sq
(!nseq, !ngaps) = mkSeq l sq
!qual = case qs of Nothing -> Nothing
Just [] -> Nothing
Just q -> Just $! mkQual l q
(!qname, !descr) = B.break (== 32) (snd h)
!fflag = B.drop 1 . C.dropWhile (/= ':') $ descr
!r = if B.length fflag < 2 || C.index fflag 1 /= ':' || (C.head fflag /= 'Y' && C.head fflag /= 'N')
then nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual }
else let !flag | C.head fflag /= 'Y' = b_flag nullBamRec .|. flagFailsQC
| otherwise = b_flag nullBamRec
!sample = B.takeWhile (/=32). B.drop 1. C.dropWhile (/=':'). B.drop 1. C.dropWhile (/=':') $ fflag
!exts | B.null sample = [ ]
| C.all (`C.elem` "ACGTN") sample = [("XI", Text sample)]
| otherwise = [("RG", Text sample)]
in nullBamRec { b_qname = qname, b_seq = nseq, b_qual = qual, b_flag = flag, b_exts = exts }
mkSeq :: Int -> [(Int,Bytes)] -> (Vector_Nucs_half Nucleotides, Int)
mkSeq ltot xs0 = unsafePerformIO $ do
fp <- mallocForeignPtrBytes (shiftR (ltot+1) 1)
g <- withForeignPtr fp $ \p -> go_even p 0 xs0
return ( Vector_Nucs_half 0 ltot fp, g )
where
go_even !p !g (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_even ss p q g l
go_even _ !g [ ] = return g
go_odd !p !g !a (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1_odd a ss p q g l
go_odd !p !g !a [ ] = poke p a >> return g
go1_odd a ss !p !q !g !l
| l > 0 = do !b <- unNs . toNucleotides <$> peekByteOff q 0
poke p $ a `shiftL` 4 .|. b
go1_even ss (plusPtr p 1) (plusPtr q 1) (g + fromEnum (b == 0)) (l-1)
| otherwise = go_odd p g a ss
go1_even ss !p !q !g !l
| l > 1 = do !a <- unNs . toNucleotides <$> peekByteOff q 0
!b <- unNs . toNucleotides <$> peekByteOff q 1
poke p $ a `shiftL` 4 .|. b
go1_even ss (plusPtr p 1) (plusPtr q 2) (g + fromEnum (a == 0) + fromEnum (b == 0)) (l-2)
| l > 0 = do !a <- unNs . toNucleotides <$> peekByteOff q 0
go_odd p (g + fromEnum (a == 0)) (a `shiftL` 4) ss
| otherwise = go_even p g ss
mkQual :: Int -> [(Int, Bytes)] -> W.Vector Qual
mkQual ltot qs0 = unsafePerformIO $ do
fp <- mallocForeignPtrBytes ltot
withForeignPtr fp $ \p -> go (castPtr p) qs0
return $! W.unsafeFromForeignPtr0 fp ltot
where
go !p (s:ss) = B.unsafeUseAsCStringLen (snd s) $ \(q,l) -> go1 ss p (castPtr q) l
go _ [ ] = return ()
go1 ss !p !q !l
| l > 0 = do peek q >>= poke p . subtract (33::Word8)
go1 ss (plusPtr p 1) (plusPtr q 1) (pred l)
| otherwise = go p ss