-- | Quality filters adapted from prehistoric pipeline. module Bio.Bam.Filter ( filterPairs, QualFilter, complexSimple, complexEntropy, qualityAverage, qualityMinimum, qualityFromOldIllumina, qualityFromNewIllumina ) where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Base import Bio.Iteratee import Data.Bits import Prelude import qualified Data.Vector.Generic as V -- | A filter/transformation applied to pairs of reads. We supply a -- predicate to be applied to single reads and one to be applied to -- pairs, tha latter can get incomplete pairs, too, if mates have been -- separated or filtered asymmetrically. filterPairs :: Monad m => (BamRec -> [BamRec]) -> (Maybe BamRec -> Maybe BamRec -> [BamRec]) -> Enumeratee [BamRec] [BamRec] m a filterPairs ps pp = eneeCheckIfDone step where step k = tryHead >>= step' k step' k Nothing = return $ liftI k step' k (Just b) | isPaired b = tryHead >>= step'' k b | otherwise = case ps b of [] -> step k ; b' -> eneeCheckIfDone step . k $ Chunk b' step'' k b Nothing = case pp (Just b) Nothing of [] -> return $ liftI k b' -> return $ k $ Chunk b' step'' k b (Just c) | b_rname b /= b_rname c || not (isPaired c) = let b' = if isSecondMate b then pp Nothing (Just b) else pp (Just b) Nothing in case b' of [] -> step' k (Just c) _ -> eneeCheckIfDone (\k' -> step' k' (Just c)) . k $ Chunk b' | isFirstMate c && isSecondMate b = step''' k c b | otherwise = step''' k b c step''' k b c = case pp (Just b) (Just c) of [] -> step k b' -> eneeCheckIfDone step . k $ Chunk b' -- | A quality filter is simply a transformation on @BamRec@s. By -- convention, quality filters should set @flagFailsQC@, a further step -- can then remove the failed reads. Filtering of individual reads -- tends to result in mate pairs with inconsistent flags, which in turn -- will result in lone mates and all sort of troubles with programs that -- expect non-broken BAM files. It is therefore recommended to use -- @pairFilter@ with suitable predicates to do the post processing. type QualFilter = BamRec -> BamRec {-# INLINE count #-} count :: (V.Vector v a, Eq a) => a -> v a -> Int count x = V.foldl' (\acc y -> if x == y then acc+1 else acc) 0 -- | Simple complexity filter aka "Nancy Filter". A read is considered -- not-sufficiently-complex if the most common base accounts for greater -- than the @cutoff@ fraction of all non-N bases. {-# INLINE complexSimple #-} complexSimple :: Double -> QualFilter complexSimple r b = if p then b else b' where b' = setQualFlag 'C' $ b { b_flag = b_flag b .|. flagFailsQC } p = let counts = [ count x $ b_seq b | x <- properBases ] lim = floor $ r * fromIntegral (sum counts) in all (<= lim) counts -- | Filter on order zero empirical entropy. Entropy per base must be -- greater than cutoff. {-# INLINE complexEntropy #-} complexEntropy :: Double -> QualFilter complexEntropy r b = if p then b else b' where b' = setQualFlag 'C' $ b { b_flag = b_flag b .|. flagFailsQC } p = ent >= r * total counts = [ count x $ b_seq b | x <- properBases ] total = fromIntegral $ V.length $ b_seq b ent = sum [ fromIntegral c * log (total / fromIntegral c) | c <- counts, c /= 0 ] / log 2 -- | Filter on average quality. Reads without quality string pass. {-# INLINE qualityAverage #-} qualityAverage :: Int -> QualFilter qualityAverage q b = if p then b else b' where b' = setQualFlag 'Q' $ b { b_flag = b_flag b .|. flagFailsQC } p = let total = V.foldl' (\a x -> a + fromIntegral (unQ x)) 0 $ b_qual b in total >= q * V.length (b_qual b) -- | Filter on minimum quality. In @qualityMinimum n q@, a read passes -- if it has no more than @n@ bases with quality less than @q@. Reads -- without quality string pass. {-# INLINE qualityMinimum #-} qualityMinimum :: Int -> Qual -> QualFilter qualityMinimum n (Q q) b = if p then b else b' where b' = setQualFlag 'Q' $ b { b_flag = b_flag b .|. flagFailsQC } p = V.length (V.filter (< Q q) (b_qual b)) <= n -- | Convert quality scores from old Illumina scale (different formula -- and offset 64 in FastQ). qualityFromOldIllumina :: BamRec -> BamRec qualityFromOldIllumina b = b { b_qual = V.map conv $ b_qual b } where conv (Q s) = let s' :: Double s' = exp $ log 10 * (fromIntegral s - 31) / (-10) p = s' / (1+s') q = - 10 * log p / log 10 in Q (round q) -- | Convert quality scores from new Illumina scale (standard formula -- but offset 64 in FastQ). qualityFromNewIllumina :: BamRec -> BamRec qualityFromNewIllumina b = b { b_qual = V.map (Q . subtract 31 . unQ) $ b_qual b }