-- | Quality filters adapted from prehistoric pipeline.

module Bio.Bam.Filter (
    filterPairs, QualFilter,
    complexSimple, complexEntropy,
    qualityAverage, qualityMinimum,
    qualityFromOldIllumina, qualityFromNewIllumina
                      ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude
import Bio.Streaming

import qualified Data.Vector.Generic as V

-- | A filter/transformation applied to pairs of reads.  We supply a
-- predicate to be applied to single reads and one to be applied to
-- pairs, the latter can get incomplete pairs, too, if mates have been
-- separated or filtered asymmetrically.  This fails spectacularly if
-- the input isn't grouped by name.

filterPairs :: Monad m => (BamRec -> [BamRec])
                       -> (Maybe BamRec -> Maybe BamRec -> [BamRec])
                       -> Stream (Of BamRec) m r -> Stream (Of BamRec) m r
filterPairs ps pp = step
  where
    step = lift . inspect >=> either pure step'
    step' (b :> s)
        | isPaired b = lift (inspect s) >>= step'' b
        | otherwise  = each (ps b) >> step s

    step'' b (Left r) = each (pp (Just b) Nothing) >> pure r

    step'' b (Right (c :> s))
        | b_rname b /= b_rname c || not (isPaired c) =
                let b' = if isSecondMate b then pp Nothing (Just b) else pp (Just b) Nothing
                in each b' >> step' (c :> s)

        | isFirstMate c && isSecondMate b = step''' c b s
        | otherwise                       = step''' b c s

    step''' b c s = each (pp (Just b) (Just c)) >> step s


-- | A quality filter is simply a transformation on @BamRec@s.  By
-- convention, quality filters should set @flagFailsQC@, a further step
-- can then remove the failed reads.  Filtering of individual reads
-- tends to result in mate pairs with inconsistent flags, which in turn
-- will result in lone mates and all sort of troubles with programs that
-- expect non-broken BAM files.  It is therefore recommended to use
-- @pairFilter@ with suitable predicates to do the post processing.

type QualFilter = BamRec -> BamRec

{-# INLINE count #-}
count :: (V.Vector v a, Eq a) => a -> v a -> Int
count x = V.foldl' (\acc y -> if x == y then acc+1 else acc) 0

-- | Simple complexity filter aka "Nancy Filter".  A read is considered
-- not-sufficiently-complex if the most common base accounts for greater
-- than the @cutoff@ fraction of all non-N bases.
{-# INLINE complexSimple #-}
complexSimple :: Double -> QualFilter
complexSimple r b = if p then b else b'
  where
    b' = setQualFlag 'C' $ b { b_flag = b_flag b .|. flagFailsQC }
    p  = let counts = [ count x $ b_seq b | x <- properBases ]
             lim = floor $ r * fromIntegral (sum counts)
         in all (<= lim) counts

-- | Filter on order zero empirical entropy.  Entropy per base must be
-- greater than cutoff.
{-# INLINE complexEntropy #-}
complexEntropy :: Double -> QualFilter
complexEntropy r b = if p then b else b'
  where
    b' = setQualFlag 'C' $ b { b_flag = b_flag b .|. flagFailsQC }
    p = ent >= r * total

    counts = [ count x $ b_seq b | x <- properBases ]
    total = fromIntegral $ V.length $ b_seq b
    ent   = sum [ fromIntegral c * log (total / fromIntegral c) | c <- counts, c /= 0 ] / log 2

-- | Filter on average quality.  Reads without qualities pass.
{-# INLINE qualityAverage #-}
qualityAverage :: Int -> QualFilter
qualityAverage q b = if p then b else b'
  where
    b' = setQualFlag 'Q' $ b { b_flag = b_flag b .|. flagFailsQC }
    p  = case b_qual b of Nothing -> True
                          Just qs -> let total = V.foldl' (\a x -> a + fromIntegral (unQ x)) 0 qs
                                     in total >= q * V.length qs

-- | Filter on minimum quality.  In @qualityMinimum n q@, a read passes
-- if it has no more than @n@ bases with quality less than @q@.  Reads
-- without qualities pass.
{-# INLINE qualityMinimum #-}
qualityMinimum :: Int -> Qual -> QualFilter
qualityMinimum n (Q q) b = if p then b else b'
  where
    b' = setQualFlag 'Q' $ b { b_flag = b_flag b .|. flagFailsQC }
    p  = case b_qual b of Nothing -> True
                          Just qs -> V.length (V.filter (< Q q) qs) <= n


-- | Convert quality scores from old Illumina scale (different formula
-- and offset 64 in FastQ).
qualityFromOldIllumina :: BamRec -> BamRec
qualityFromOldIllumina b = b { b_qual = V.map conv <$> b_qual b }
  where
    conv (Q s) = let s' :: Double
                     s' = exp $ log 10 * (fromIntegral s - 31) / (-10)
                     p  = s' / (1+s')
                     q  = - 10 * log p / log 10
                 in Q (round q)

-- | Convert quality scores from new Illumina scale (standard formula
-- but offset 64 in FastQ).
qualityFromNewIllumina :: BamRec -> BamRec
qualityFromNewIllumina b = b { b_qual = V.map (Q . subtract 31 . unQ) <$> b_qual b }