-- | Trimming of reads as found in BAM files.  Implements trimming low
-- quality sequence from the 3' end.

module Bio.Bam.Trim where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Bam.Rmdup        ( ECig(..), setMD, toECig )
import Bio.Iteratee
import Bio.Prelude

import qualified Data.ByteString                        as B
import qualified Data.Vector.Fusion.Bundle.Size         as S
import qualified Data.Vector.Fusion.Bundle.Monadic      as S
import qualified Data.Vector.Fusion.Stream.Monadic      as SS
import qualified Data.Vector.Fusion.Util                as SS
import qualified Data.Vector.Hybrid.Internal            as Hybrid
import qualified Data.Vector.Generic                    as V
import qualified Data.Vector.Unboxed                    as U

-- | Trims from the 3' end of a sequence.
-- @trim_3\' p b@ trims the 3' end of the sequence in @b@ at the
-- earliest position such that @p@ evaluates to true on every suffix
-- that was trimmed off.  Note that the 3' end may be the beginning of
-- the sequence if it happens to be stored in reverse-complemented form.
-- Also note that trimming from the 3' end may not make sense for reads
-- that were constructed by merging paired end data (but we cannot take
-- care of that here).  Further note that trimming may break dependent
-- information, notably the "mate" information of the mate and many
-- optional fields.

trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec
trim_3' p b | b_flag b `testBit` 4 = trim_rev
            | otherwise            = trim_fwd
  where
    trim_fwd = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $
                            zip (inits . reverse . V.toList $ b_seq b)
                                (inits . reverse . V.toList $ b_qual b)
               in trim_3 l b

    trim_rev = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $
                            zip (inits . V.toList $ b_seq  b)
                                (inits . V.toList $ b_qual b)
               in trim_3 l b

trim_3 :: Int -> BamRec -> BamRec
trim_3 l b | b_flag b `testBit` 4 = trim_rev
           | otherwise            = trim_fwd
  where
    trim_fwd = let (_, cigar') = trim_back_cigar (b_cigar b) l
                   c = modMd (takeECig (V.length (b_seq  b) - l)) b
               in c { b_seq   = V.take (V.length (b_seq  c) - l) (b_seq  c)
                    , b_qual  = V.take (V.length (b_qual c) - l) (b_qual c)
                    , b_cigar = cigar'
                    , b_exts  = map (\(k,e) -> case e of
                                        Text t | k `elem` trim_set
                                          -> (k, Text (B.take (B.length t - l) t))
                                        _ -> (k,e)
                                    ) (b_exts c) }

    trim_rev = let (off, cigar') = trim_fwd_cigar (b_cigar b) l
                   c = modMd (dropECig l) b
               in c { b_seq   = V.drop l (b_seq  c)
                    , b_qual  = V.drop l (b_qual c)
                    , b_pos   = b_pos c + off
                    , b_cigar = cigar'
                    , b_exts  = map (\(k,e) -> case e of
                                        Text t | k `elem` trim_set
                                          -> (k, Text (B.drop l t))
                                        _ -> (k,e)
                                    ) (b_exts c) }

    trim_set = ["BQ","CQ","CS","E2","OQ","U2"]

    modMd :: (ECig -> ECig) -> BamRec -> BamRec
    modMd f br = maybe br (setMD br . f . toECig (b_cigar br)) (getMd br)

    endOf :: ECig -> ECig
    endOf  WithMD     = WithMD
    endOf  WithoutMD  = WithoutMD
    endOf (Mat' _ es) = endOf es
    endOf (Ins' _ es) = endOf es
    endOf (SMa' _ es) = endOf es
    endOf (Rep' _ es) = endOf es
    endOf (Del' _ es) = endOf es
    endOf (Nop' _ es) = endOf es
    endOf (HMa' _ es) = endOf es
    endOf (Pad' _ es) = endOf es

    takeECig :: Int -> ECig -> ECig
    takeECig 0  es          = endOf es
    takeECig _  WithMD      = WithMD
    takeECig _  WithoutMD   = WithoutMD
    takeECig n (Mat' m  es) = Mat' n  $ if n > m then takeECig (n-m) es else WithMD
    takeECig n (Ins' m  es) = Ins' n  $ if n > m then takeECig (n-m) es else WithMD
    takeECig n (SMa' m  es) = SMa' n  $ if n > m then takeECig (n-m) es else WithMD
    takeECig n (Rep' ns es) = Rep' ns $ takeECig (n-1) es
    takeECig n (Del' ns es) = Del' ns $ takeECig n es
    takeECig n (Nop' m  es) = Nop' m  $ takeECig n es
    takeECig n (HMa' m  es) = HMa' m  $ takeECig n es
    takeECig n (Pad' m  es) = Pad' m  $ takeECig n es

    dropECig :: Int -> ECig -> ECig
    dropECig 0  es         = es
    dropECig _  WithMD     = WithMD
    dropECig _  WithoutMD  = WithoutMD
    dropECig n (Mat' m es) = if n > m then dropECig (n-m) es else Mat' n WithMD
    dropECig n (Ins' m es) = if n > m then dropECig (n-m) es else Ins' n WithMD
    dropECig n (SMa' m es) = if n > m then dropECig (n-m) es else SMa' n WithMD
    dropECig n (Rep' _ es) = dropECig (n-1) es
    dropECig n (Del' _ es) = dropECig n es
    dropECig n (Nop' _ es) = dropECig n es
    dropECig n (HMa' _ es) = dropECig n es
    dropECig n (Pad' _ es) = dropECig n es


trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar )
trim_back_cigar c l = (o, V.fromList $ reverse c') where (o,c') = sanitize_cigar . trim_cigar l $ reverse $ V.toList c
trim_fwd_cigar  c l = (o, V.fromList           c') where (o,c') = sanitize_cigar $ trim_cigar l $ V.toList c

sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar (o, [        ])                          = (o, [])
sanitize_cigar (o, (op:*l):xs) | op == Pad              = sanitize_cigar (o,xs)         -- del P
                               | op == Del || op == Nop = sanitize_cigar (o + l, xs)    -- adjust D,N
                               | op == Ins              = (o, (SMa :* l):xs)            -- I --> S
                               | otherwise              = (o, (op :* l):xs)             -- rest is fine

trim_cigar :: Int -> [Cigar] -> (Int, [Cigar])
trim_cigar 0 cs = (0, cs)
trim_cigar _ [] = (0, [])
trim_cigar l ((op:*ll):cs) | bad_op op = let (o,cs') = trim_cigar l cs in (o + reflen op ll, cs')
                           | otherwise = case l `compare` ll of
    LT -> (reflen op  l, (op :* (ll-l)):cs)
    EQ -> (reflen op ll,                cs)
    GT -> let (o,cs') = trim_cigar (l - ll) cs in (o + reflen op ll, cs')

  where
    reflen op' = if ref_op op' then id else const 0
    bad_op o = o /= Mat && o /= Ins && o /= SMa
    ref_op o = o == Mat || o == Del


-- | Trim predicate to get rid of low quality sequence.
-- @trim_low_quality q ns qs@ evaluates to true if all qualities in @qs@
-- are smaller (i.e. worse) than @q@.
trim_low_quality :: Qual -> a -> [Qual] -> Bool
trim_low_quality q = const $ all (< q)


-- | Overlap-merging of read pairs.  We shall compute the likelihood
-- for every possible overlap, then select the most likely one (unless it
-- looks completely random), compute a quality from the second best
-- merge, then merge and clamp the quality accordingly.
-- (We could try looking for chimaera after completing the merge, if
-- only we knew which ones to expect?)
--
-- Two reads go in, with two adapter lists.  We return 'Nothing' if all
-- merges looked mostly random.  Else we return the two original reads,
-- flagged as 'eflagVestigial' *and* the merged version, flagged as
-- 'eflagMerged' and optionally 'eflagTrimmed'.  All reads contain the
-- computed qualities (in YM and YN), which we also return.
--
-- The merging automatically limits quality scores some of the time.  We
-- additionally impose a hard limit of 63 to avoid difficulties
-- representing the result, and even that is ridiculous.  Sane people
-- would further limit the returned quality!  (In practice, map quality
-- later imposes a limit anyway, so no worries...)

merge_overlap :: BamRec -> [ U.Vector Nucleotides ]
              -> BamRec -> [ U.Vector Nucleotides ]
              -> Maybe ( BamRec, BamRec, BamRec, Int, Int )
merge_overlap r1 ads1 r2 ads2 =
    case possible_merges of
        [                             ] -> Nothing
        (score,  len) : [             ] -> result len score  plain_score
        (score1, len) : (score2, _) : _ -> result len score1 score2
  where
    rq1 = Hybrid.V (b_seq r1) (b_qual r1)
    rq2 = Hybrid.V (b_seq r2) (b_qual r2)

    -- the "merge" score if there is no overlap
    plain_score = 6 * fromIntegral (V.length rq1 + V.length rq2)

    possible_merges = sortBy (\a b -> fst a `compare` fst b)
                                   [ ( merge_score ads1 ads2 rq1 rq2 l, l )
                                   | l <- [0 .. V.length rq1 + V.length rq2 - 1] ]

    flag_vestigial    br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br }
    store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2          - s1) $
                                         updateE "YN" (Int $ plain_score - s1) $ b_exts br }

    result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1
                          , store_quals s1 s2 $ flag_vestigial r2
                          , store_quals s1 s2 $ merged_read l (fromIntegral . min 63 $ s2-s1)
                          , s2 - s1, plain_score - s1 )

    merged_read l qmax
        | V.length merged_seq /= l = error $ "Logic error in merged_read: " ++ show (V.length merged_seq, l)
        | otherwise = nullBamRec {
                b_qname = b_qname r1,
                b_flag  = flagUnmapped .|. complement pair_flags .&. b_flag r1,
                b_seq   = V.unstream $ flip S.fromStream (S.Exact $ V.length merged_seq) $ SS.map fst $ S.elements $ V.stream merged_seq,
                b_qual  = V.unstream $ flip S.fromStream (S.Exact $ V.length merged_seq) $ SS.map snd $ S.elements $ V.stream merged_seq,
                b_exts  = let ff = if l < V.length (b_seq r1) then eflagTrimmed else 0
                          in updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagMerged .|. ff) $ b_exts r1 }
      where
        merged_seq = V.concat
                [ V.take (l - V.length rq2) rq1
                , merge_seqs qmax        (V.take l $ V.drop (l - V.length rq2) rq1)
                             (V.reverse $ V.take l $ V.drop (l - V.length rq1) rq2)
                , V.reverse $ V.take (l - V.length rq1) rq2 ]

    pair_flags = flagPaired.|.flagProperlyPaired.|.flagMateUnmapped.|.flagMateReversed.|.flagFirstMate.|.flagSecondMate

    merge_seqs qmax = V.zipWith $ \(!n1,!(Q q1)) (!n2,!(Q q2)) ->
            if     n1 == n2 then (n1, Q $ min qmax (q1 + q2))
            else if q1 > q2 then (n1, Q $           q1 - q2 )
            else                 (n2, Q $           q2 - q1 )


-- | Trimming for a single read:  we need one adapter only (the one coming
-- /after/ the read), here provided as a list of options, and then we
-- merge with an empty second read.  Results in up to two reads (the
-- original, possibly flagged, and the trimmed one, definitely flagged,
-- and two qualities).
trim_adapter :: BamRec -> [ U.Vector Nucleotides ] -> Maybe ( BamRec, BamRec, Int, Int )
trim_adapter r1 ads1 =
    case possible_trims of
        [                             ] -> Nothing
        (score,  len) : [             ] -> result len score  plain_score
        (score1, len) : (score2, _) : _ -> result len score1 score2
  where
    rq1 = Hybrid.V (b_seq r1) (b_qual r1)

    -- the "merge" score if there is no trimming
    plain_score = 6 * fromIntegral (V.length rq1)

    possible_trims = sortBy (\a b -> fst a `compare` fst b)
                                  [ ( merge_score ads1 [V.empty] rq1 V.empty l, l )
                                  | l <- [0 .. V.length rq1 - 1] ]

    flag_vestigial    br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br }
    store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2          - s1) $
                                         updateE "YN" (Int $ plain_score - s1) $ b_exts br }

    result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1
                          , store_quals s1 s2 $ trimmed_read l
                          , s2 - s1, plain_score - s1 )

    trimmed_read l = nullBamRec {
            b_qname = b_qname r1,
            b_flag  = flagUnmapped .|. b_flag r1,
            b_seq   = V.take l $ b_seq  r1,
            b_qual  = V.take l $ b_qual r1,
            b_exts  = updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagTrimmed) $ b_exts r1 }


-- | For merging, we don't need the complete adapters (length around 70!),
-- only a sufficient prefix.  Taking only the more-or-less constant
-- part (length around 30), there aren't all that many different
-- adapters in the world.  To deal with pretty much every library, we
-- only need the following forward adapters, which will be the default
-- (defined here in the direction they would be sequenced in):  Genomic
-- R2, Multiplex R2, Fraft P7.

default_fwd_adapters :: [ U.Vector Nucleotides ]
default_fwd_adapters = map (U.fromList. map toNucleotides)
         [ {- Genomic R2   -}  "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG"
         , {- Multiplex R2 -}  "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
         , {- Graft P7     -}  "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG" ]

-- | Like 'default_rev_adapters', these are the few adapters needed for
-- the reverse read (defined in the direction they would be sequenced in
-- as part of the second read):  Genomic R1, CL 72.

default_rev_adapters :: [ U.Vector Nucleotides ]
default_rev_adapters = map (U.fromList. map toNucleotides)
         [ {- Genomic_R1   -}  "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
         , {- CL72         -}  "GGAAGAGCGTCGTGTAGGGAAAGAGTGT" ]

-- We need to compute the likelihood of a read pair given an assumed
-- insert length.  The likelihood of the first read is the likelihood of
-- a match with the adapter where it overlaps the 3' adapter, elsewhere
-- it's 1/4 per position.  The likelihood of the second read is the
-- likelihood of a match with the adapter where it overlaps the adapter,
-- the likehood of a read-read match where it overlaps read one, 1/4 per
-- position elsewhere.  (Yes, this ignores base composition.  It doesn't
-- matter enough.)

merge_score
    :: ( V.Vector v Nucleotides, V.Vector u (Nucleotides, Qual) )
    => [ v Nucleotides ]         -- 3' adapters as they appear in the first read
    -> [ v Nucleotides ]         -- 5' adapters as they appear in the second read
    -> u (Nucleotides, Qual)            -- first read
    -> u (Nucleotides, Qual)           -- second read
    -> Int                          -- assumed insert length
    -> Int                          -- score (roughly sum of qualities at mismatches)
merge_score fwd_adapters rev_adapters read1 read2 l
    =   6 * fromIntegral (l `min` V.length read1)                                           -- read1, part before adapter
      + 6 * fromIntegral (max 0 (l - V.length read1))                                       -- read2, part before overlap

      + minimum [ match_adapter (V.drop l read1) fwd_ad                                     -- read1, match with forward adapter
                + 6 * fromIntegral (max 0 (V.length read1 - V.length fwd_ad - l))           -- read1, part after (known) adapter
                | fwd_ad <- fwd_adapters ]

      + minimum [ match_adapter (V.drop l read2) rev_ad                                     -- read2, match with reverse adapter
                + 6 * fromIntegral (max 0 (V.length read2 - V.length rev_ad - l))           -- read2, part after (known) adapter
                | rev_ad <- rev_adapters ]

      + match_reads (V.take l $ V.drop (l - V.length read2) read1)
                    (V.take l $ V.drop (l - V.length read1) read2)                          -- read2, overlap with read1
  where
    -- match_adapter :: u (Nucleotides, Qual) -> v Nucleotides -> Double
    match_adapter rd ad = SS.unId $ SS.foldl' (+) 0 $
                          SS.zipWith (\(!n, Q !q) m -> if n == m then 0 else min 25 (fromIntegral q))
                                     (S.elements $ V.stream rd) (S.elements $ V.stream ad)

    -- match_reads :: u (Nucleotides, Qual) -> u (Nucleotides, Qual) -> Double
    match_reads rd1 rd2 = SS.unId $ SS.foldl' (+) 0 $
                          SS.zipWith (\(!n1, Q !q1) (!n2, Q !q2) -> if n1 == compls n2 then 0 else fromIntegral $ min q1 q2)
                                     (S.elements $ V.stream rd1) (S.elements $ V.streamR rd2)

mergeTrimBam :: Monad m => [U.Vector Nucleotides] -> [U.Vector Nucleotides] -> Enumeratee [BamRec] [BamRec] m a
mergeTrimBam fwd_ads rev_ads = convStream go
  where
    go = do r1 <- headStream
            if isPaired r1
              then tryHead >>= go2 r1
              else case trim_adapter r1 fwd_ads of
                    Nothing                -> return [r1]
                    Just (r1',r1t,_q1,_q2) -> return [r1t,r1']

    go2 r1  Nothing  = error $ "Lone mate found: " ++ show (b_qname r1)
    go2 r1 (Just r2) = case merge_overlap r1 fwd_ads r2 rev_ads of
                    Nothing                   -> return [r1,r2]
                    Just (r1',r2',rm,_q1,_q2) -> return [rm,r1',r2']