module Bio.Bam.Rmdup(
rmdup, Collapse, cons_collapse, cheap_collapse,
cons_collapse_keep, cheap_collapse_keep,
check_sort, normalizeTo, wrapTo,
ECig(..), toECig, setMD, toCigar
) where
import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Iteratee
import Bio.Prelude hiding ( left, right )
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as T
import qualified Data.Map.Strict as M
import qualified Data.Vector.Generic as V
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Unboxed as U
data Collapse = Collapse {
collapse :: [BamRec] -> (Decision,[BamRec]),
originals :: [BamRec] -> [BamRec] }
data Decision = Consensus { fromDecision :: BamRec }
| Representative { fromDecision :: BamRec }
cons_collapse :: Qual -> Collapse
cons_collapse maxq = Collapse (do_collapse maxq) (const [])
cons_collapse_keep :: Qual -> Collapse
cons_collapse_keep maxq = Collapse (do_collapse maxq) (map (\b -> b { b_flag = b_flag b .|. flagDuplicate }))
cheap_collapse :: Collapse
cheap_collapse = Collapse do_cheap_collapse (const [])
cheap_collapse_keep :: Collapse
cheap_collapse_keep = Collapse do_cheap_collapse (map (\b -> b { b_flag = b_flag b .|. flagDuplicate }))
rmdup :: (Monad m, Ord l) => (BamRec -> l) -> Bool -> Collapse -> Enumeratee [BamRec] [(Int,BamRec)] m r
rmdup label strand_preserved collapse_cfg =
check_sort id "input must be sorted for rmdup to work" ><>
mapGroups rmdup_group ><>
check_sort snd "internal error, output isn't sorted anymore"
where
rmdup_group = nice_sort . do_rmdup label strand_preserved collapse_cfg
same_pos u v = b_cpos u == b_cpos v
b_cpos u = (b_rname u, b_pos u)
nice_sort = sortBy $ comparing (V.length . b_seq . snd)
mapGroups f o = tryHead >>= maybe (return o) (\a -> eneeCheckIfDone (mg1 f a []) o)
mg1 f a acc k = tryHead >>= \case
Nothing -> return . k . Chunk . f $ a:acc
Just b | same_pos a b -> mg1 f a (b:acc) k
| otherwise -> eneeCheckIfDone (mg1 f b []) . k . Chunk . f $ a:acc
check_sort :: Monad m => (a -> BamRec) -> String -> Enumeratee [a] [a] m b
check_sort f msg out = tryHead >>= maybe (return out) (\a -> eneeCheckIfDone (step a) out)
where
step a k = tryHead >>= maybe (return . k $ Chunk [a]) (step' a k)
step' a k b | (b_rname (f a), b_pos (f a)) > (b_rname (f b), b_pos (f b)) = fail $ "rmdup: " ++ msg
| otherwise = eneeCheckIfDone (step b) . k $ Chunk [a]
do_rmdup :: Ord l => (BamRec -> l) -> Bool -> Collapse -> [BamRec] -> [(Int,BamRec)]
do_rmdup label strand_preserved Collapse{..} =
concatMap do_rmdup1 . M.elems . accumMap label id
where
do_rmdup1 rds = results ++ map ((,) 0) (originals (leftovers ++ r1 ++ r2 ++ r3))
where
(results, leftovers) = merge_singles singles' unaligned' $
[ (str, second fromDecision b) | ((_,str ),b) <- M.toList merged' ] ++
[ (str, second fromDecision b) | ((_,str,_),b) <- M.toList pairs' ]
(raw_pairs, raw_singles) = partition isPaired rds
(merged, true_singles) = partition (liftA2 (||) isMerged isTrimmed) raw_singles
(pairs, raw_half_pairs) = partition b_totally_aligned raw_pairs
(half_unaligned, half_aligned) = partition isUnmapped raw_half_pairs
mkMap :: Ord a => (BamRec -> a) -> [BamRec] -> (M.Map a (Int,Decision), [BamRec])
mkMap f x = let m1 = M.map (length &&& collapse) $ accumMap f id x
in (M.map (second fst) m1, concatMap (snd.snd) $ M.elems m1)
(pairs',r1) = mkMap (\b -> (b_mate_pos b, b_strand b, b_mate b)) pairs
(merged',r2) = mkMap (\b -> (alignedLength (b_cigar b), b_strand b)) merged
(singles',r3) = mkMap b_strand (true_singles++half_aligned)
unaligned' = accumMap b_strand id half_unaligned
b_strand b = strand_preserved && isReversed b
b_mate b = strand_preserved && isFirstMate b
merge_singles :: M.Map Bool (Int,Decision)
-> M.Map Bool [BamRec]
-> [ (Bool, (Int, BamRec)) ]
-> ([(Int,BamRec)],[BamRec])
merge_singles singles unaligneds = go
where
go ( (str, (m,v)) : paireds) =
let (r,l) = merge_singles (M.delete str singles) (M.delete str unaligneds) paireds
unal = M.findWithDefault [] str unaligneds ++ l
in case M.lookup str singles of
Nothing -> ( (m,v) : r, unal )
Just (n, Consensus w) -> ( (n, add_xp_of w v) : r, unal )
Just (n, Representative w) -> ( (n, add_xp_of w v) : r, w : unal )
go [] = merge_halves unaligneds (M.toList singles)
add_xp_of w v = v { b_exts = updateE "XP" (Int $ extAsInt 1 "XP" w `oplus` extAsInt 1 "XP" v) (b_exts v) }
merge_halves :: M.Map Bool [BamRec]
-> [(Bool, (Int,Decision))]
-> ([(Int,BamRec)],[BamRec])
merge_halves unaligneds ((_, (n, Consensus v)) : singles) = ( (n, v { b_flag = b_flag v .&. complement pflags }) : r, l )
where
(r,l) = merge_halves unaligneds singles
pflags = flagPaired .|. flagProperlyPaired .|. flagMateUnmapped .|. flagMateReversed .|. flagFirstMate .|. flagSecondMate
merge_halves unaligneds ((str, (n, Representative v)) : singles) = ((n,v) : map ((,)1) (take 1 same) ++ r, drop 1 same ++ diff ++ l)
where
(r,l) = merge_halves (M.delete str unaligneds) singles
(same,diff) = partition (is_mate_of v) $ M.findWithDefault [] str unaligneds
is_mate_of a b = b_qname a == b_qname b && isPaired a && isPaired b && isFirstMate a == isSecondMate b
merge_halves unaligneds [] = ( [], concat $ M.elems unaligneds )
type MPos = (Refseq, Int, Bool, Bool)
b_mate_pos :: BamRec -> MPos
b_mate_pos b = (b_mrnm b, b_mpos b, isUnmapped b, isMateUnmapped b)
b_totally_aligned :: BamRec -> Bool
b_totally_aligned b = not (isUnmapped b || isMateUnmapped b)
accumMap :: Ord k => (a -> k) -> (a -> v) -> [a] -> M.Map k [v]
accumMap f g = go M.empty
where
go m [ ] = m
go m (a:as) = let ws = M.findWithDefault [] (f a) m ; g' = g a
in g' `seq` go (M.insert (f a) (g':ws) m) as
do_collapse :: Qual -> [BamRec] -> (Decision, [BamRec])
do_collapse maxq [br] | V.all (<= maxq) (b_qual br) = ( Representative br, [ ] )
| otherwise = ( Consensus lq_br, [br] )
where
lq_br = br { b_qual = V.map (min maxq) $ b_qual br
, b_virtual_offset = 0
, b_qname = b_qname br `B.snoc` c2w 'c' }
do_collapse maxq brs = ( Consensus b0 { b_exts = modify_extensions $ b_exts b0
, b_flag = failflag .&. complement flagDuplicate
, b_mapq = Q $ rmsq $ map (unQ . b_mapq) $ good brs
, b_cigar = cigar'
, b_seq = V.fromList $ map fst cons_seq_qual
, b_qual = V.fromList $ map snd cons_seq_qual
, b_qname = b_qname b0 `B.snoc` 99
, b_virtual_offset = 0 }, brs )
where
!b0 = minimumBy (comparing b_qname) brs
!most_fail = 2 * length (filter isFailsQC brs) > length brs
!failflag | most_fail = b_flag b0 .|. flagFailsQC
| otherwise = b_flag b0 .&. complement flagFailsQC
rmsq xs = case foldl' (\(!n,!d) x -> (n + fromIntegral x * fromIntegral x, d + 1)) (0,0) xs of
(!n,!d) -> round $ sqrt $ (n::Double) / fromIntegral (d::Int)
maj xs = head . maximumBy (comparing length) . group . sort $ xs
nub' = concatMap head . group . sort
!cigar' = maj $ map b_cigar brs
good = filter ((==) cigar' . b_cigar)
cons_seq_qual = [ consensus maxq [ (V.unsafeIndex (b_seq b) i, q)
| b <- good brs, let q = if V.null (b_qual b) then Q 23 else b_qual b V.! i ]
| i <- [0 .. len - 1] ]
where !len = V.length . b_seq . head $ good brs
md' = case [ (b_seq b,md,b) | b <- good brs, Just md <- [ getMd b ] ] of
[ ] -> []
(seq1, md1,b) : _ -> case mk_new_md' [] (V.toList cigar') md1 (V.toList seq1) (map fst cons_seq_qual) of
Right x -> x
Left (MdFail cigs ms osq nsq) -> error $ unlines
[ "Broken MD field when trying to construct new MD!"
, "QNAME: " ++ show (b_qname b)
, "POS: " ++ shows (unRefseq (b_rname b)) ":" ++ show (b_pos b)
, "CIGAR: " ++ show cigs
, "MD: " ++ show ms
, "refseq: " ++ show osq
, "readseq: " ++ show nsq ]
nm' = sum $ [ n | Ins :* n <- VS.toList cigar' ] ++ [ n | Del :* n <- VS.toList cigar' ] ++ [ 1 | MdRep _ <- md' ]
xa' = nub' [ T.split ';' xas | Just (Text xas) <- map (lookup "XA" . b_exts) brs ]
modify_extensions es = foldr ($!) es $
[ let vs = mapMaybe (lookup k . b_exts) brs
in if null vs then id else updateE k $! maj vs | k <- do_maj ] ++
[ let vs = [ v | Just (Int v) <- map (lookup k . b_exts) brs ]
in if null vs then id else updateE k $! Int (rmsq vs) | k <- do_rmsq ] ++
map deleteE useless ++
[ updateE "NM" $! Int nm'
, updateE "XP" $! Int (foldl' (\a b -> a `oplus` extAsInt 1 "XP" b) 0 brs)
, if null xa' then id else updateE "XA" $! (Text $ T.intercalate (T.singleton ';') xa')
, if null md' then id else updateE "MD" $! (Text $ showMd md') ]
useless = map fromString $ words "BQ CM FZ Q2 R2 XM XO XG YQ EN CQ CS E2 FS OQ OP OC U2 H0 H1 H2 HI NH IH ZQ"
do_rmsq = map fromString $ words "AM AS MQ PQ SM UQ"
do_maj = map fromString $ words "X0 X1 XT XS XF XE BC LB RG XI XJ YI YJ"
minViewBy :: (a -> a -> Ordering) -> [a] -> (a,[a])
minViewBy _ [ ] = error "minViewBy on empty list"
minViewBy cmp (x:xs) = go x [] xs
where
go m acc [ ] = (m,acc)
go m acc (a:as) = case m `cmp` a of GT -> go a (m:acc) as
_ -> go m (a:acc) as
data MdFail = MdFail [Cigar] [MdOp] [Nucleotides] [Nucleotides]
mk_new_md' :: [MdOp] -> [Cigar] -> [MdOp] -> [Nucleotides] -> [Nucleotides] -> Either MdFail [MdOp]
mk_new_md' acc [] [] [] [] = Right $ normalize [] acc
where
normalize a (MdNum 0:os) = normalize a os
normalize (MdNum n:a) (MdNum m:os) = normalize (MdNum (n+m):a) os
normalize a (MdDel []:os) = normalize a os
normalize (MdDel u:a) (MdDel v:os) = normalize (MdDel (v++u):a) os
normalize a ( o:os) = normalize (o:a) os
normalize a [ ] = a
mk_new_md' acc ( _ :* 0 : cigs) mds osq nsq = mk_new_md' acc cigs mds osq nsq
mk_new_md' acc cigs (MdNum 0 : mds) osq nsq = mk_new_md' acc cigs mds osq nsq
mk_new_md' acc cigs (MdDel [] : mds) osq nsq = mk_new_md' acc cigs mds osq nsq
mk_new_md' acc (Mat :* u : cigs) (MdRep b : mds) (_:osq) (n:nsq)
| b == n = mk_new_md' (MdNum 1 : acc) (Mat :* (u-1):cigs) mds osq nsq
| otherwise = mk_new_md' (MdRep b : acc) (Mat :* (u-1):cigs) mds osq nsq
mk_new_md' acc (Mat :* u : cigs) (MdNum v : mds) (o:osq) (n:nsq)
| o == n = mk_new_md' (MdNum 1 : acc) (Mat :* (u-1):cigs) (MdNum (v-1) : mds) osq nsq
| otherwise = mk_new_md' (MdRep o : acc) (Mat :* (u-1):cigs) (MdNum (v-1) : mds) osq nsq
mk_new_md' acc (Del :* n : cigs) (MdDel bs : mds) osq nsq | n == length bs = mk_new_md' (MdDel bs : acc) cigs mds osq nsq
mk_new_md' acc (Del :* n : cigs) (MdDel (b:bs) : mds) osq nsq = mk_new_md' (MdDel [b] : acc) (Del :* (n-1) : cigs) (MdDel bs:mds) osq nsq
mk_new_md' acc (Del :* n : cigs) (MdRep b : mds) osq nsq = mk_new_md' (MdDel [b] : acc) (Del :* (n-1) : cigs) mds osq nsq
mk_new_md' acc (Del :* n : cigs) (MdNum m : mds) osq nsq = mk_new_md' (MdDel [nucsN] : acc) (Del :* (n-1) : cigs) (MdNum (m-1):mds) osq nsq
mk_new_md' acc (Ins :* n : cigs) md osq nsq = mk_new_md' acc cigs md (drop n osq) (drop n nsq)
mk_new_md' acc (SMa :* n : cigs) md osq nsq = mk_new_md' acc cigs md (drop n osq) (drop n nsq)
mk_new_md' acc (HMa :* _ : cigs) md osq nsq = mk_new_md' acc cigs md osq nsq
mk_new_md' acc (Pad :* _ : cigs) md osq nsq = mk_new_md' acc cigs md osq nsq
mk_new_md' acc (Nop :* _ : cigs) md osq nsq = mk_new_md' acc cigs md osq nsq
mk_new_md' _acc cigs ms osq nsq = Left $ MdFail cigs ms osq nsq
consensus :: Qual -> [ (Nucleotides, Qual) ] -> (Nucleotides, Qual)
consensus (Q maxq) nqs = if qr > 3 then (n0, Q qr) else (nucsN, Q 0)
where
accs :: U.Vector Int
accs = U.accum (+) (U.replicate 16 0) [ (fromIntegral n, fromIntegral q) | (Ns n,Q q) <- nqs ]
(n0,q0) : (_,q1) : _ = sortBy (flip $ comparing snd) $ zip [Ns 0 ..] $ U.toList accs
qr = fromIntegral $ (q0-q1) `min` fromIntegral maxq
do_cheap_collapse :: [BamRec] -> ( Decision, [BamRec] )
do_cheap_collapse [b] = ( Representative b, [] )
do_cheap_collapse bs = ( Representative $ replaceXP new_xp b0, bx )
where
(b0, bx) = minViewBy (comparing b_qname) bs
new_xp = foldl' (\a b -> a `oplus` extAsInt 1 "XP" b) 0 bs
replaceXP :: Int -> BamRec -> BamRec
replaceXP new_xp b0 = b0 { b_exts = updateE "XP" (Int new_xp) $ b_exts b0 }
oplus :: Int -> Int -> Int
_ `oplus` (-1) = -1
(-1) `oplus` _ = -1
a `oplus` b = a + b
normalizeTo :: Seqid -> Int -> BamRec -> Either BamRec BamRec
normalizeTo nm l b = lr $ b { b_pos = b_pos b `mod` l
, b_mpos = b_mpos b `mod` l
, b_mapq = if dups_are_fine then Q 37 else b_mapq b
, b_exts = if dups_are_fine then deleteE "XA" (b_exts b) else b_exts b }
where
lr = if b_pos b >= l then Left else Right
dups_are_fine = all_match_XA (extAsString "XA" b)
all_match_XA s = case T.split ';' s of [xa1, xa2] | T.null xa2 -> one_match_XA xa1
[xa1] -> one_match_XA xa1
_ -> False
one_match_XA s = case T.split ',' s of (sq:pos:_) | sq == nm -> pos_match_XA pos ; _ -> False
pos_match_XA s = case T.readInt s of Just (p,z) | T.null z -> int_match_XA p ; _ -> False
int_match_XA p | p >= 0 = (p-1) `mod` l == b_pos b `mod` l && not (isReversed b)
| otherwise = (-p-1) `mod` l == b_pos b `mod` l && isReversed b
wrapTo :: Int -> BamRec -> [Either BamRec BamRec]
wrapTo l b = if overhangs then do_wrap else [Right b]
where
overhangs = not (isUnmapped b) && b_pos b < l && l < b_pos b + alignedLength (b_cigar b)
do_wrap = case split_ecig (l - b_pos b) $ toECig (b_cigar b) (fromMaybe [] $ getMd b) of
(left,right) -> [ Right $ b { b_cigar = toCigar left } `setMD` left
, Left $ b { b_cigar = toCigar right, b_pos = 0 } `setMD` right ]
split_ecig :: Int -> ECig -> (ECig, ECig)
split_ecig _ WithMD = (WithMD, WithMD)
split_ecig _ WithoutMD = (WithoutMD, WithoutMD)
split_ecig 0 ecs = (mask_all ecs, ecs)
split_ecig i (Ins' n ecs) = case split_ecig i ecs of (u,v) -> (Ins' n u, SMa' n v)
split_ecig i (SMa' n ecs) = case split_ecig i ecs of (u,v) -> (SMa' n u, SMa' n v)
split_ecig i (HMa' n ecs) = case split_ecig i ecs of (u,v) -> (HMa' n u, HMa' n v)
split_ecig i (Pad' n ecs) = case split_ecig i ecs of (u,v) -> (Pad' n u, v)
split_ecig i (Mat' n ecs)
| i >= n = case split_ecig (i-n) ecs of (u,v) -> (Mat' n u, SMa' n v)
| otherwise = (Mat' i $ SMa' (n-i) $ mask_all ecs, SMa' i $ Mat' (n-i) ecs)
split_ecig i (Rep' x ecs) = case split_ecig (i-1) ecs of (u,v) -> (Rep' x u, SMa' 1 v)
split_ecig i (Del' x ecs) = case split_ecig (i-1) ecs of (u,v) -> (Del' x u, v)
split_ecig i (Nop' n ecs)
| i >= n = case split_ecig (i-n) ecs of (u,v) -> (Nop' n u, v)
| otherwise = (Nop' i $ mask_all ecs, Nop' (n-i) ecs)
mask_all :: ECig -> ECig
mask_all WithMD = WithMD
mask_all WithoutMD = WithoutMD
mask_all (Nop' _ ec) = mask_all ec
mask_all (HMa' _ ec) = mask_all ec
mask_all (Pad' _ ec) = mask_all ec
mask_all (Del' _ ec) = mask_all ec
mask_all (Rep' _ ec) = SMa' 1 $ mask_all ec
mask_all (Mat' n ec) = SMa' n $ mask_all ec
mask_all (Ins' n ec) = SMa' n $ mask_all ec
mask_all (SMa' n ec) = SMa' n $ mask_all ec
data ECig = WithMD
| WithoutMD
| Mat' Int ECig
| Rep' Nucleotides ECig
| Ins' Int ECig
| Del' Nucleotides ECig
| Nop' Int ECig
| SMa' Int ECig
| HMa' Int ECig
| Pad' Int ECig
toECig :: VS.Vector Cigar -> [MdOp] -> ECig
toECig = go . VS.toList
where
go cs (MdNum 0:mds) = go cs mds
go cs (MdDel []:mds) = go cs mds
go (_:*0 :cs) mds = go cs mds
go [ ] [ ] = WithMD
go [ ] _ = WithoutMD
go (Mat :* n : cs) (MdRep x:mds) = Rep' x $ go (Mat :* (n-1) : cs) mds
go (Mat :* n : cs) (MdNum m:mds)
| n < m = Mat' n $ go cs (MdNum (m-n):mds)
| n > m = Mat' m $ go (Mat :* (n-m) : cs) mds
| otherwise = Mat' n $ go cs mds
go (Mat :* n : cs) _ = Mat' n $ go' cs
go (Ins :* n : cs) mds = Ins' n $ go cs mds
go (Del :* n : cs) (MdDel (x:xs):mds) = Del' x $ go (Del :* (n-1) : cs) (MdDel xs:mds)
go (Del :* n : cs) _ = Del' nucsN $ go' (Del :* (n-1) : cs)
go (Nop :* n : cs) mds = Nop' n $ go cs mds
go (SMa :* n : cs) mds = SMa' n $ go cs mds
go (HMa :* n : cs) mds = HMa' n $ go cs mds
go (Pad :* n : cs) mds = Pad' n $ go cs mds
go' (_ :* 0 : cs) = go' cs
go' [ ] = WithoutMD
go' (Mat :* n : cs) = Mat' n $ go' cs
go' (Ins :* n : cs) = Ins' n $ go' cs
go' (Del :* n : cs) = Del' nucsN $ go' (Del :* (n-1) : cs)
go' (Nop :* n : cs) = Nop' n $ go' cs
go' (SMa :* n : cs) = SMa' n $ go' cs
go' (HMa :* n : cs) = HMa' n $ go' cs
go' (Pad :* n : cs) = Pad' n $ go' cs
toCigar :: ECig -> VS.Vector Cigar
toCigar = V.fromList . go
where
go WithMD = []
go WithoutMD = []
go (Ins' n ecs) = Ins :* n : go ecs
go (Nop' n ecs) = Nop :* n : go ecs
go (HMa' n ecs) = HMa :* n : go ecs
go (Pad' n ecs) = Pad :* n : go ecs
go (SMa' n ecs) = go_sma n ecs
go (Mat' n ecs) = go_mat n ecs
go (Rep' _ ecs) = go_mat 1 ecs
go (Del' _ ecs) = go_del 1 ecs
go_sma !n (SMa' m ecs) = go_sma (n+m) ecs
go_sma !n ecs = SMa :* n : go ecs
go_mat !n (Mat' m ecs) = go_mat (n+m) ecs
go_mat !n (Rep' _ ecs) = go_mat (n+1) ecs
go_mat !n ecs = Mat :* n : go ecs
go_del !n (Del' _ ecs) = go_del (n+1) ecs
go_del !n ecs = Del :* n : go ecs
setMD :: BamRec -> ECig -> BamRec
setMD b ec = case go ec of
Just md -> b { b_exts = updateE "MD" (Text $ showMd md) (b_exts b) }
Nothing -> b { b_exts = deleteE "MD" (b_exts b) }
where
go WithMD = Just []
go WithoutMD = Nothing
go (Ins' _ ecs) = go ecs
go (Nop' _ ecs) = go ecs
go (SMa' _ ecs) = go ecs
go (HMa' _ ecs) = go ecs
go (Pad' _ ecs) = go ecs
go (Mat' n ecs) = (if n == 0 then id else fmap (addNum n)) $ go ecs
go (Rep' x ecs) = (if isGap x then id else fmap (addRep x)) $ go ecs
go (Del' x ecs) = (if isGap x then id else fmap (addDel x)) $ go ecs
addNum n (MdNum m : mds) = MdNum (n+m) : mds
addNum n mds = MdNum n : mds
addRep x mds = MdRep x : mds
addDel x (MdDel y : mds) = MdDel (x:y) : mds
addDel x mds = MdDel [x] : mds