module Bio.Bam.Trim where
import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Bam.Rmdup ( ECig(..), setMD, toECig )
import Bio.Iteratee
import Bio.Prelude
import Foreign.C.Types ( CInt(..) )
import Foreign.Ptr ( Ptr )
import qualified Data.ByteString as B
import qualified Data.Vector.Generic as V
import qualified Data.Vector.Storable as W
trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec
trim_3' p b | b_flag b `testBit` 4 = trim_rev
| otherwise = trim_fwd
where
trim_fwd = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $
zip (inits . reverse . V.toList $ b_seq b)
(inits . reverse . V.toList $ b_qual b)
in trim_3 l b
trim_rev = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $
zip (inits . V.toList $ b_seq b)
(inits . V.toList $ b_qual b)
in trim_3 l b
trim_3 :: Int -> BamRec -> BamRec
trim_3 l b | b_flag b `testBit` 4 = trim_rev
| otherwise = trim_fwd
where
trim_fwd = let (_, cigar') = trim_back_cigar (b_cigar b) l
c = modMd (takeECig (V.length (b_seq b) l)) b
in c { b_seq = V.take (V.length (b_seq c) l) (b_seq c)
, b_qual = V.take (V.length (b_qual c) l) (b_qual c)
, b_cigar = cigar'
, b_exts = map (\(k,e) -> case e of
Text t | k `elem` trim_set
-> (k, Text (B.take (B.length t l) t))
_ -> (k,e)
) (b_exts c) }
trim_rev = let (off, cigar') = trim_fwd_cigar (b_cigar b) l
c = modMd (dropECig l) b
in c { b_seq = V.drop l (b_seq c)
, b_qual = V.drop l (b_qual c)
, b_pos = b_pos c + off
, b_cigar = cigar'
, b_exts = map (\(k,e) -> case e of
Text t | k `elem` trim_set
-> (k, Text (B.drop l t))
_ -> (k,e)
) (b_exts c) }
trim_set = ["BQ","CQ","CS","E2","OQ","U2"]
modMd :: (ECig -> ECig) -> BamRec -> BamRec
modMd f br = maybe br (setMD br . f . toECig (b_cigar br)) (getMd br)
endOf :: ECig -> ECig
endOf WithMD = WithMD
endOf WithoutMD = WithoutMD
endOf (Mat' _ es) = endOf es
endOf (Ins' _ es) = endOf es
endOf (SMa' _ es) = endOf es
endOf (Rep' _ es) = endOf es
endOf (Del' _ es) = endOf es
endOf (Nop' _ es) = endOf es
endOf (HMa' _ es) = endOf es
endOf (Pad' _ es) = endOf es
takeECig :: Int -> ECig -> ECig
takeECig 0 es = endOf es
takeECig _ WithMD = WithMD
takeECig _ WithoutMD = WithoutMD
takeECig n (Mat' m es) = Mat' n $ if n > m then takeECig (nm) es else WithMD
takeECig n (Ins' m es) = Ins' n $ if n > m then takeECig (nm) es else WithMD
takeECig n (SMa' m es) = SMa' n $ if n > m then takeECig (nm) es else WithMD
takeECig n (Rep' ns es) = Rep' ns $ takeECig (n1) es
takeECig n (Del' ns es) = Del' ns $ takeECig n es
takeECig n (Nop' m es) = Nop' m $ takeECig n es
takeECig n (HMa' m es) = HMa' m $ takeECig n es
takeECig n (Pad' m es) = Pad' m $ takeECig n es
dropECig :: Int -> ECig -> ECig
dropECig 0 es = es
dropECig _ WithMD = WithMD
dropECig _ WithoutMD = WithoutMD
dropECig n (Mat' m es) = if n > m then dropECig (nm) es else Mat' n WithMD
dropECig n (Ins' m es) = if n > m then dropECig (nm) es else Ins' n WithMD
dropECig n (SMa' m es) = if n > m then dropECig (nm) es else SMa' n WithMD
dropECig n (Rep' _ es) = dropECig (n1) es
dropECig n (Del' _ es) = dropECig n es
dropECig n (Nop' _ es) = dropECig n es
dropECig n (HMa' _ es) = dropECig n es
dropECig n (Pad' _ es) = dropECig n es
trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar )
trim_back_cigar c l = (o, V.fromList $ reverse c') where (o,c') = sanitize_cigar . trim_cigar l $ reverse $ V.toList c
trim_fwd_cigar c l = (o, V.fromList c') where (o,c') = sanitize_cigar $ trim_cigar l $ V.toList c
sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar (o, [ ]) = (o, [])
sanitize_cigar (o, (op:*l):xs) | op == Pad = sanitize_cigar (o,xs)
| op == Del || op == Nop = sanitize_cigar (o + l, xs)
| op == Ins = (o, (SMa :* l):xs)
| otherwise = (o, (op :* l):xs)
trim_cigar :: Int -> [Cigar] -> (Int, [Cigar])
trim_cigar 0 cs = (0, cs)
trim_cigar _ [] = (0, [])
trim_cigar l ((op:*ll):cs) | bad_op op = let (o,cs') = trim_cigar l cs in (o + reflen op ll, cs')
| otherwise = case l `compare` ll of
LT -> (reflen op l, (op :* (lll)):cs)
EQ -> (reflen op ll, cs)
GT -> let (o,cs') = trim_cigar (l ll) cs in (o + reflen op ll, cs')
where
reflen op' = if ref_op op' then id else const 0
bad_op o = o /= Mat && o /= Ins && o /= SMa
ref_op o = o == Mat || o == Del
trim_low_quality :: Qual -> a -> [Qual] -> Bool
trim_low_quality q = const $ all (< q)
merge_overlap :: BamRec -> [ W.Vector Nucleotides ]
-> BamRec -> [ W.Vector Nucleotides ]
-> Maybe ( BamRec, BamRec, BamRec, Int, Int )
merge_overlap r1 ads1 r2 ads2
| V.null (b_seq r1) && V.null (b_seq r2) = Nothing
| otherwise = result mlen score1 score2
where
plain_score = 6 * fromIntegral (len_r1 + len_r2)
len_r1 = V.length $ b_seq r1
len_r2 = V.length $ b_seq r2
b_seq_r1 = V.convert $ b_seq r1
b_seq_r2 = V.convert $ b_seq r2
b_qual_r1 = V.convert $ b_qual r1
b_qual_r2 = V.convert $ b_qual r2
(score1, mlen, score2) = twoMins plain_score (len_r1 + len_r2) $
merge_score ads1 ads2 b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2
flag_vestigial br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br }
store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2 s1) $
updateE "YN" (Int $ plain_score s1) $ b_exts br }
result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1
, store_quals s1 s2 $ flag_vestigial r2
, store_quals s1 s2 $ merged_read l (fromIntegral . min 63 $ s2s1)
, s2 s1, plain_score s1 )
merged_read l qmax
| V.length merged_seq /= l = error $ "Logic error in merged_read: " ++ show (V.length merged_seq, l)
| otherwise = nullBamRec {
b_qname = b_qname r1,
b_flag = flagUnmapped .|. complement pair_flags .&. b_flag r1,
b_seq = merged_seq,
b_qual = merged_qual,
b_exts = let ff = if l < len_r1 then eflagTrimmed else 0
in updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagMerged .|. ff) $ b_exts r1 }
where
merged_seq = V.convert $ V.concat
[ V.take (l len_r2) (b_seq_r1)
, merge_seqs (V.take l $ V.drop (l len_r2) b_seq_r1)
(V.take l $ V.drop (l len_r2) b_qual_r1)
(V.reverse $ V.take l $ V.drop (l len_r1) b_seq_r2)
(V.reverse $ V.take l $ V.drop (l len_r1) b_qual_r2)
, V.reverse $ V.take (l len_r1) b_seq_r2 ]
merged_qual = V.convert $ V.concat
[ V.take (l len_r2) (b_qual_r1)
, merge_quals qmax (V.take l $ V.drop (l len_r2) b_seq_r1)
(V.take l $ V.drop (l len_r2) b_qual_r1)
(V.reverse $ V.take l $ V.drop (l len_r1) b_seq_r2)
(V.reverse $ V.take l $ V.drop (l len_r1) b_qual_r2)
, V.reverse $ V.take (l len_r1) b_qual_r2 ]
pair_flags = flagPaired.|.flagProperlyPaired.|.flagMateUnmapped.|.flagMateReversed.|.flagFirstMate.|.flagSecondMate
merge_seqs v1 v2 v3 v4 = V.zipWith4 zz v1 v2 v3 v4
where
zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = n1
| q1 > q2 = n1
| otherwise = compls n2
merge_quals qmax v1 v2 v3 v4 = V.zipWith4 zz v1 v2 v3 v4
where
zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = Q $ min qmax (q1 + q2)
| q1 > q2 = Q $ q1 q2
| otherwise = Q $ q2 q1
trim_adapter :: BamRec -> [ W.Vector Nucleotides ] -> Maybe ( BamRec, BamRec, Int, Int )
trim_adapter r1 ads1
| V.null (b_seq r1) = Nothing
| otherwise = result mlen score1 score2
where
plain_score = 6 * fromIntegral (V.length (b_seq r1))
b_seq_r1 = V.convert $ b_seq r1
b_qual_r1 = V.convert $ b_qual r1
(score1, mlen, score2) = twoMins plain_score (V.length (b_seq r1)) $
merge_score ads1 [V.empty] b_seq_r1 b_qual_r1 V.empty V.empty
flag_vestigial br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br }
store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2 s1) $
updateE "YN" (Int $ plain_score s1) $ b_exts br }
result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1
, store_quals s1 s2 $ trimmed_read l
, s2 s1, plain_score s1 )
trimmed_read l = nullBamRec {
b_qname = b_qname r1,
b_flag = flagUnmapped .|. b_flag r1,
b_seq = V.take l $ b_seq r1,
b_qual = V.take l $ b_qual r1,
b_exts = updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagTrimmed) $ b_exts r1 }
default_fwd_adapters :: [ W.Vector Nucleotides ]
default_fwd_adapters = map (W.fromList. map toNucleotides)
[ "AGATCGGAAGAGCGGTTCAG"
, "AGATCGGAAGAGCACACGTC"
, "AGATCGGAAGAGCTCGTATG" ]
default_rev_adapters :: [ W.Vector Nucleotides ]
default_rev_adapters = map (W.fromList. map toNucleotides)
[ "AGATCGGAAGAGCGTCGTGT"
, "GGAAGAGCGTCGTGTAGGGA" ]
merge_score
:: [ W.Vector Nucleotides ]
-> [ W.Vector Nucleotides ]
-> W.Vector Nucleotides -> W.Vector Qual
-> W.Vector Nucleotides -> W.Vector Qual
-> Int
-> Int
merge_score fwd_adapters rev_adapters !read1 !qual1 !read2 !qual2 !l
= 6 * fromIntegral (l `min` V.length read1)
+ 6 * fromIntegral (max 0 (l V.length read1))
+ foldl' (\acc fwd_ad -> min acc
(match_adapter l read1 qual1 fwd_ad +
6 * fromIntegral (max 0 (V.length read1 V.length fwd_ad l)))
) maxBound fwd_adapters
+ foldl' (\acc rev_ad -> min acc
(match_adapter l read2 qual2 rev_ad +
6 * fromIntegral (max 0 (V.length read2 V.length rev_ad l)))
) maxBound rev_adapters
+ match_reads l read1 qual1 read2 qual2
match_adapter :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> Int
match_adapter !off !rd !qs !ad
| V.length rd /= V.length qs = error "read/qual length mismatch"
| efflength <= 0 = 0
| otherwise
= fromIntegral . unsafePerformIO $
W.unsafeWith rd $ \p_rd ->
W.unsafeWith qs $ \p_qs ->
W.unsafeWith ad $ \p_ad ->
prim_match_ad (fromIntegral off)
(fromIntegral efflength)
p_rd p_qs p_ad
where
!efflength = (V.length rd off) `min` V.length ad
foreign import ccall unsafe "prim_match_ad"
prim_match_ad :: CInt -> CInt
-> Ptr Nucleotides -> Ptr Qual
-> Ptr Nucleotides -> IO CInt
match_reads :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> W.Vector Qual -> Int
match_reads !l !rd1 !qs1 !rd2 !qs2
| V.length rd1 /= V.length qs1 || V.length rd2 /= V.length qs2 = error "read/qual length mismatch"
| efflength <= 0 = 0
| otherwise
= fromIntegral . unsafePerformIO $
W.unsafeWith rd1 $ \p_rd1 ->
W.unsafeWith qs1 $ \p_qs1 ->
W.unsafeWith rd2 $ \p_rd2 ->
W.unsafeWith qs2 $ \p_qs2 ->
prim_match_reads (fromIntegral minidx1)
(fromIntegral maxidx2)
(fromIntegral efflength)
p_rd1 p_qs1 p_rd2 p_qs2
where
!minidx1 = (l V.length rd2) `max` 0
!maxidx2 = l `min` V.length rd2
!efflength = ((V.length rd1 + V.length rd2 l) `min` l) `max` 0
foreign import ccall unsafe "prim_match_reads"
prim_match_reads :: CInt -> CInt -> CInt
-> Ptr Nucleotides -> Ptr Qual
-> Ptr Nucleotides -> Ptr Qual -> IO CInt
twoMins :: (Bounded a, Ord a) => a -> Int -> (Int -> a) -> (a,Int,a)
twoMins a0 imax f = go a0 0 maxBound 0 0
where
go !m1 !i1 !m2 !i2 !i
| i == imax = (m1,i1,m2)
| otherwise =
case f i of
x | x < m1 -> go x i m1 i1 (i+1)
| x < m2 -> go m1 i1 x i (i+1)
| otherwise -> go m1 i1 m2 i2 (i+1)
mergeTrimBam :: Monad m => [W.Vector Nucleotides] -> [W.Vector Nucleotides] -> Enumeratee [BamRec] [BamRec] m a
mergeTrimBam fwd_ads rev_ads = convStream go
where
go = do r1 <- headStream
if isPaired r1
then tryHead >>= go2 r1
else case trim_adapter r1 fwd_ads of
Nothing -> return [r1]
Just (r1',r1t,_q1,_q2) -> return [r1t,r1']
go2 r1 Nothing = error $ "Lone mate found: " ++ show (b_qname r1)
go2 r1 (Just r2) = case merge_overlap r1 fwd_ads r2 rev_ads of
Nothing -> return [r1,r2]
Just (r1',r2',rm,_q1,_q2) -> return [rm,r1',r2']