module Bio.Adna (
DmgStats(..),
CompositionStats,
SubstitutionStats,
addFragType,
damagePatternsIter,
damagePatternsIterMD,
damagePatternsIter2Bit,
alnFromMd,
DamageParameters(..),
NewDamageParameters(..),
GenDamageParameters(..),
DamageModel,
bang, nudge,
Alignment(..),
FragType(..),
Subst(..),
NPair,
noDamage,
univDamage,
empDamage,
Mat44D(..),
MMat44D(..),
scalarMat,
complMat,
freezeMats,
bwa_cal_maxdiff
) where
import Bio.Bam
import Bio.Prelude
import Bio.TwoBit
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as UM
newtype Mat44D = Mat44D (U.Vector Double) deriving (Show, Generic)
newtype MMat44D = MMat44D (UM.IOVector Double)
type DamageModel = Bool -> Int -> Int -> Mat44D
data Subst = Nucleotide :-> Nucleotide deriving (Eq, Ord, Ix, Show)
infix 9 :->
infix 8 `bang`
bang :: Mat44D -> Subst -> Double
bang (Mat44D v) (N x :-> N y)
| U.length v == 16 = v U.! (fromIntegral x + 4 * fromIntegral y)
| otherwise = error $ "Huh? " ++ show (U.length v)
nudge :: MMat44D -> Subst -> Double -> IO ()
nudge (MMat44D v) (N x :-> N y) a = UM.read v i >>= UM.write v i . (+) a
where i = fromIntegral x + 4 * fromIntegral y
scalarMat :: Double -> Mat44D
scalarMat s = Mat44D $ U.fromListN 16 [ s, 0, 0, 0
, 0, s, 0, 0
, 0, 0, s, 0
, 0, 0, 0, s ]
complMat :: Mat44D -> Mat44D
complMat v = Mat44D $ U.fromListN 16 [ v `bang` compl x :-> compl y
| y <- range (nucA, nucT)
, x <- range (nucA, nucT) ]
freezeMats :: MMat44D -> MMat44D -> IO Mat44D
freezeMats (MMat44D vv) (MMat44D ww) = do
v <- Mat44D <$> U.freeze vv
w <- complMat . Mat44D <$> U.freeze ww
let sums = U.generate 4 $ \x0 ->
let x = N $ fromIntegral x0
in sum [ v `bang` x :-> z + w `bang` x :-> z
| z <- range (nucA, nucT) ] + 4
return . Mat44D $ U.fromListN 16
[ (v `bang` x :-> y + w `bang` x :-> y + 1) / s
| y <- range (nucA, nucT)
, x <- range (nucA, nucT)
, let s = sums U.! fromIntegral (unN x) ]
noDamage :: DamageModel
noDamage _ _ _ = one
where !one = scalarMat 1
data DamageParameters float = DP { ssd_sigma :: !float
, ssd_delta :: !float
, ssd_lambda :: !float
, ssd_kappa :: !float
, dsd_sigma :: !float
, dsd_delta :: !float
, dsd_lambda :: !float }
deriving (Read, Show, Generic)
data NewDamageParameters vec float = NDP { dp_gc_frac :: !float
, dp_mu :: !float
, dp_nu :: !float
, dp_alpha5 :: !(vec float)
, dp_beta5 :: !(vec float)
, dp_alpha :: !float
, dp_beta :: !float
, dp_alpha3 :: !(vec float)
, dp_beta3 :: !(vec float) }
deriving (Read, Show, Generic)
data GenDamageParameters vec float
= UnknownDamage
| OldDamage (DamageParameters float)
| NewDamage (NewDamageParameters vec float)
deriving (Show, Generic, Read)
genSubstMat :: Double -> Double -> Mat44D
genSubstMat p q = Mat44D $ U.fromListN 16 [ 1, 0, q, 0
, 0, 1p, 0, 0
, 0, 0, 1q, 0
, 0, p, 0, 1 ]
univDamage :: DamageParameters Double -> DamageModel
univDamage DP{..} r l i = genSubstMat (p1+p2) (q1+q2)
where
(p1, q1) = if r then let lam5 = ssd_lambda ^ (li)
lam3 = ssd_kappa ^ (1+i)
lam = lam3 + lam5 lam3 * lam5
p = ssd_sigma * lam + ssd_delta * (1lam)
in (0,p)
else let lam5 = ssd_lambda ^ (1+i)
lam3 = ssd_kappa ^ (li)
lam = lam3 + lam5 lam3 * lam5
p = ssd_sigma * lam + ssd_delta * (1lam)
in (p,0)
p2 = dsd_sigma * lam5_ds + dsd_delta * (1lam5_ds)
q2 = dsd_sigma * lam3_ds + dsd_delta * (1lam3_ds)
lam5_ds = dsd_lambda ^ (1+i)
lam3_ds = dsd_lambda ^ (li)
empDamage :: NewDamageParameters U.Vector Double -> DamageModel
empDamage NDP{..} =
\r l i -> if i+i < l then
if r then fromMaybe middleRev (rev5 V.!? i)
else fromMaybe middle (fwd5 V.!? i)
else
if r then fromMaybe middleRev (rev3 V.!? (li1))
else fromMaybe middle (fwd3 V.!? (li1))
where
!middle = genSubstMat' dp_alpha dp_beta
!middleRev = genSubstMat' dp_beta dp_alpha
!fwd5 = V.zipWith genSubstMat' (G.convert dp_alpha5) (G.convert dp_beta5)
!fwd3 = V.zipWith genSubstMat' (G.convert dp_alpha3) (G.convert dp_beta3)
!rev5 = V.zipWith genSubstMat' (G.convert dp_beta5) (G.convert dp_alpha5)
!rev3 = V.zipWith genSubstMat' (G.convert dp_beta3) (G.convert dp_alpha3)
genSubstMat' a b = genSubstMat (recip $ 1 + exp (a)) (recip $ 1 + exp (b))
data DmgStats a = DmgStats {
basecompo5 :: CompositionStats,
basecompo3 :: CompositionStats,
substs5 :: SubstitutionStats,
substs3 :: SubstitutionStats,
substs5d5 :: SubstitutionStats,
substs3d5 :: SubstitutionStats,
substs5d3 :: SubstitutionStats,
substs3d3 :: SubstitutionStats,
substs5dd :: SubstitutionStats,
substs3dd :: SubstitutionStats,
substs5cpg :: SubstitutionStats,
substs3cpg :: SubstitutionStats,
stats_more :: a }
deriving Show
type CompositionStats = [( Maybe Nucleotide, U.Vector Int )]
type SubstitutionStats = [( Subst, U.Vector Int )]
data FragType = Complete | Leading | Trailing deriving (Show, Eq)
type NPair = ( Nucleotides, Nucleotides )
data Alignment = ALN
{ a_sequence :: !(U.Vector NPair)
, a_fragment_type :: !FragType }
addFragType :: BamMeta -> Enumeratee [BamRaw] [(BamRaw,FragType)] m b
addFragType meta = mapStream $ \br -> (br, case unpackBam br of
b | isFirstMate b && isPaired b -> Leading
| isSecondMate b && isPaired b -> Trailing
| not sane -> Complete
| isFirstMate b || isSecondMate b -> Complete
| isTrimmed b || isMerged b -> Complete
| otherwise -> Leading)
where
sane = null [ () | ("PG",line) <- meta_other_shit meta
, ("PN","mergeTrimReadsBAM") <- line ]
damagePatternsIter2Bit :: MonadIO m
=> Refs -> TwoBitFile -> Int -> Int
-> Iteratee [Alignment] m b
-> Iteratee [(BamRaw,FragType)] m (DmgStats b)
damagePatternsIter2Bit refs tbf ctx rng it =
mapMaybeStream (\(br,ft) -> do
let b@BamRec{..} = unpackBam br
guard (not $ isUnmapped b)
let ref_nm = sq_name $ getRef refs b_rname
ref = getFragment tbf ref_nm (b_pos ctx) (alignedLength b_cigar + 2*ctx)
pps = aln_from_ref (U.drop ctx ref) b_seq b_cigar
return (b, ft, ref, pps)) =$
damagePatternsIter ctx rng it
damagePatternsIterMD :: MonadIO m
=> Int -> Iteratee [Alignment] m b
-> Iteratee [(BamRaw,FragType)] m (DmgStats b)
damagePatternsIterMD rng it =
mapMaybeStream (\(br,ft) -> do
let b@BamRec{..} = unpackBam br
guard (not $ isUnmapped b)
md <- getMd b
let pps = alnFromMd b_seq b_cigar md
ref = U.map fromN $ U.filter ((/=) gap . fst) pps
return (b, ft, ref, pps)) =$
damagePatternsIter 0 rng it
where
fromN (ns,_) | ns == nucsA = 2
| ns == nucsC = 1
| ns == nucsG = 3
| ns == nucsT = 0
| otherwise = 4
damagePatternsIter :: MonadIO m
=> Int -> Int
-> Iteratee [Alignment] m b
-> Iteratee [(BamRec, FragType, U.Vector Word8, U.Vector NPair)] m (DmgStats b)
damagePatternsIter ctx rng it = mapStream revcom_both =$ do
let maxwidth = ctx + rng
acc_bc <- liftIO $ UM.replicate (2 * 5 * maxwidth) (0::Int)
acc_st <- liftIO $ UM.replicate (2 * 4 * 4 * 4 * rng) (0::Int)
acc_cg <- liftIO $ UM.replicate (2 * 2 * 4 * rng) (0::Int)
it' <- flip mapStreamM it $ \(BamRec{..}, a_fragment_type, ref, a_sequence) -> liftIO $ do
#ifdef DEBUG
when (U.any (<0) ref || U.any (>4) ref) . error $
"Unexpected value in reference fragment: " ++ show ref
#endif
let good_pairs = U.indexed a_sequence
good_pairs_rev = U.indexed $ U.reverse a_sequence
let (width5, width3) = case a_fragment_type of
Leading -> (full_width, 0)
Trailing -> (0, full_width)
Complete -> (half_width, half_width)
where full_width = min (U.length ref) $ ctx + min rng (alignedLength b_cigar)
half_width = min (U.length ref) $ ctx + min rng (alignedLength b_cigar `div` 2)
mapM_ (\i -> bump (fromIntegral (ref U.! i ) * maxwidth + i) acc_bc) [0 .. width51]
mapM_ (\i -> bump (fromIntegral (ref U.! (i + U.length ref) +6) * maxwidth + i) acc_bc) [width3 .. 1]
let dmgbase = 2*4*4*rng * ( (if U.null a_sequence || U.head a_sequence /= (nucsC,nucsT) then 1 else 0)
+ (if U.null a_sequence || U.last a_sequence /= (nucsC,nucsT) then 2 else 0) )
let len_at_5 = case a_fragment_type of Leading -> min rng (G.length b_seq)
Complete -> min rng (G.length b_seq `div` 2)
Trailing -> 0
U.forM_ (U.take len_at_5 good_pairs) $
\(i,uv) -> withPair uv $ \j -> bump (j * rng + i + dmgbase) acc_st
U.zipWithM_
(\(i,(u,v)) (_,(w,z)) ->
when (u == nucsC && w == nucsG) $ do
withNs v $ \y -> bump ( y * rng + i ) acc_cg
withNs z $ \y -> bump ((y+4) * rng + i+1) acc_cg)
(U.take len_at_5 good_pairs) (U.drop 1 good_pairs)
let len_at_3 = case a_fragment_type of Leading -> 0
Complete -> min rng (G.length b_seq `div` 2)
Trailing -> min rng (G.length b_seq)
U.forM_ (U.take len_at_3 good_pairs_rev) $
\(i,uv) -> withPair uv $ \j -> bump ((17+j) * rng i 1 + dmgbase) acc_st
U.zipWithM_
(\(_,(u,v)) (i,(w,z)) ->
when (u == nucsC && w == nucsG) $ do
withNs v $ \y -> bump ((y+ 9) * rng i2) acc_cg
withNs z $ \y -> bump ((y+13) * rng i1) acc_cg)
(U.drop 1 good_pairs_rev) (U.take len_at_3 good_pairs_rev)
return ALN{..}
let nsubsts = 4*4*rng
mk_substs off = sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((4*i+j)*rng + off*nsubsts) rng acc_st)
| (i,n1) <- zip [0..] [nucA..nucT]
, (j,n2) <- zip [0..] [nucA..nucT] ]
accs <- liftIO $ DmgStats <$> sequence [ (,) nuc <$> U.unsafeFreeze (UM.slice (i*maxwidth) maxwidth acc_bc)
| (i,nuc) <- zip [2,1,3,0,4] [Just nucA, Just nucC, Just nucG, Just nucT, Nothing] ]
<*> sequence [ (,) nuc <$> U.unsafeFreeze (UM.slice (i*maxwidth) maxwidth acc_bc)
| (i,nuc) <- zip [7,6,8,5,9] [Just nucA, Just nucC, Just nucG, Just nucT, Nothing] ]
<*> mk_substs 0
<*> mk_substs 1
<*> mk_substs 2
<*> mk_substs 3
<*> mk_substs 4
<*> mk_substs 5
<*> mk_substs 6
<*> mk_substs 7
<*> sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((i+j)*rng) rng acc_cg)
| (i,n1) <- [(0,nucC), (4,nucG)]
, (j,n2) <- zip [0..] [nucA..nucT] ]
<*> sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((i+j)*rng) rng acc_cg)
| (i,n2) <- [(8,nucC), (12,nucG)]
, (j,n1) <- zip [0..] [nucA..nucT] ]
accs' <- accs `liftM` lift (run it')
return $ accs' { substs5 = mconcat [ substs5 accs', substs5d5 accs', substs5d3 accs', substs5dd accs' ]
, substs3 = mconcat [ substs3 accs', substs3d5 accs', substs3d3 accs', substs3dd accs' ]
, substs5d5 = mconcat [ substs5d5 accs', substs5dd accs']
, substs3d5 = mconcat [ substs3d5 accs', substs3dd accs']
, substs5d3 = mconcat [ substs5d3 accs', substs5dd accs']
, substs3d3 = mconcat [ substs3d3 accs', substs3dd accs'] }
where
withPair (Ns u, Ns v) k = case pairTab `U.unsafeIndex` fromIntegral (16*u+v) of
j -> if j >= 0 then k j else return ()
!pairTab = U.replicate 256 (1) U.//
[ (fromIntegral $ 16*u+v, x*4+y) | (Ns u,x) <- zip [nucsA, nucsC, nucsG, nucsT] [0,1,2,3]
, (Ns v,y) <- zip [nucsA, nucsC, nucsG, nucsT] [0,1,2,3] ]
#ifdef DEBUG
bump i v = UM.read v i >>= UM.write v i . succ
#else
bump i v = UM.unsafeRead v i >>= UM.unsafeWrite v i . succ
#endif
withNs ns k | ns == nucsA = k 0
| ns == nucsC = k 1
| ns == nucsG = k 2
| ns == nucsT = k 3
| otherwise = return ()
instance Monoid a => Monoid (DmgStats a) where
mempty = DmgStats { basecompo5 = empty_compo
, basecompo3 = empty_compo
, substs5 = empty_subst
, substs3 = empty_subst
, substs5d5 = empty_subst
, substs3d5 = empty_subst
, substs5d3 = empty_subst
, substs3d3 = empty_subst
, substs5dd = empty_subst
, substs3dd = empty_subst
, substs5cpg = empty_subst
, substs3cpg = empty_subst
, stats_more = mempty }
where
empty_compo = [ (nuc, U.empty) | nuc <- [Just nucA, Just nucC, Just nucG, Just nucT, Nothing] ]
empty_subst = [ (n1 :-> n2, U.empty) | n1 <- [nucA..nucT], n2 <- [nucA..nucT] ]
a `mappend` b = DmgStats { basecompo5 = zipWith s1 (basecompo5 a) (basecompo5 b)
, basecompo3 = zipWith s1 (basecompo3 a) (basecompo3 b)
, substs5 = zipWith s2 (substs5 a) (substs5 b)
, substs3 = zipWith s2 (substs3 a) (substs3 b)
, substs5d5 = zipWith s2 (substs5d5 a) (substs5d5 b)
, substs3d5 = zipWith s2 (substs3d5 a) (substs3d5 b)
, substs5d3 = zipWith s2 (substs5d3 a) (substs5d3 b)
, substs3d3 = zipWith s2 (substs3d3 a) (substs3d3 b)
, substs5dd = zipWith s2 (substs5dd a) (substs5dd b)
, substs3dd = zipWith s2 (substs3dd a) (substs3dd b)
, substs5cpg = zipWith s2 (substs5cpg a) (substs5cpg b)
, substs3cpg = zipWith s2 (substs3cpg a) (substs3cpg b)
, stats_more = mappend (stats_more a) (stats_more b) }
where
s1 (x, u) (z, v) | x /= z = error "Mismatch in zip. This is a bug."
| U.null u = (x, v)
| U.null v = (x, u)
| otherwise = (x, U.zipWith (+) u v)
s2 (x :-> y, u) (z :-> w, v) | x /= z || y /= w = error "Mismatch in zip. This is a bug."
| U.null u = (x :-> y, v)
| U.null v = (x :-> y, u)
| otherwise = (x :-> y, U.zipWith (+) u v)
revcom_both :: ( BamRec, FragType, U.Vector Word8, U.Vector (Nucleotides, Nucleotides) )
-> ( BamRec, FragType, U.Vector Word8, U.Vector (Nucleotides, Nucleotides) )
revcom_both (b, ft, ref, pps)
| isReversed b = ( b, ft, revcom_ref ref, revcom_pairs pps )
| otherwise = ( b, ft, ref, pps )
where
revcom_ref = U.reverse . U.map (\c -> if c > 3 then c else xor c 2)
revcom_pairs = U.reverse . U.map (compls *** compls)
aln_from_ref :: U.Vector Word8 -> Vector_Nucs_half Nucleotides -> VS.Vector Cigar -> U.Vector NPair
aln_from_ref ref0 qry0 cig0 = U.fromList $ step ref0 qry0 cig0
where
step ref qry cig1
| U.null ref || G.null qry || G.null cig1 = []
| otherwise = case G.unsafeHead cig1 of { op :* n ->
case G.unsafeTail cig1 of { cig ->
case op of {
Mat -> zipWith (\r q -> (nn r,q)) (G.toList (G.take n ref))
(G.toList (G.take n qry)) ++ step (G.drop n ref) (G.drop n qry) cig ;
Del -> step (G.drop n ref) qry cig ;
Ins -> map (\q -> ( gap, q )) (G.toList (G.take n qry)) ++ step ref (G.drop n qry) cig ;
SMa -> map (\q -> ( gap, q )) (G.toList (G.take n qry)) ++ step ref (G.drop n qry) cig ;
HMa -> replicate n (gap, nucsN) ++ step ref qry cig ;
Nop -> step ref qry cig ;
Pad -> step ref qry cig }}}
nn 0 = nucsT
nn 1 = nucsC
nn 2 = nucsA
nn 3 = nucsG
nn _ = nucsN
alnFromMd :: Vector_Nucs_half Nucleotides -> VS.Vector Cigar -> [MdOp] -> U.Vector NPair
alnFromMd qry0 cig0 md0 = U.fromList $ step qry0 cig0 md0
where
step qry cig1 md
| G.null qry || G.null cig1 || null md = []
| otherwise = case G.unsafeHead cig1 of op :* n -> step' qry op n (G.unsafeTail cig1) md
step' qry _ 0 cig md = step qry cig md
step' qry op n cig (MdNum 0 : md) = step' qry op n cig md
step' qry op n cig (MdDel [] : md) = step' qry op n cig md
step' qry Mat n cig (MdNum m : md)
| n < m = map (\q -> (q,q)) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig (MdNum (mn) : md)
| n > m = map (\q -> (q,q)) (G.toList (G.take m qry)) ++ step' (G.drop m qry) Mat (nm) cig md
| n == m = map (\q -> (q,q)) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig md
step' qry Mat n cig (MdRep c : md) = ( c, G.head qry ) : step' (G.tail qry) Mat (n1) cig md
step' _ Mat _ _ _ = []
step' qry Del n cig (MdDel (_:ss) : md) = step' qry Del (n1) cig (MdDel ss : md)
step' _ Del _ _ _ = []
step' qry Ins n cig md = map ((,) gap) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig md
step' qry SMa n cig md = map ((,) gap) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig md
step' qry HMa n cig md = replicate n (gap, nucsN) ++ step qry cig md
step' qry Nop _ cig md = step qry cig md
step' qry Pad _ cig md = step qry cig md
bwa_cal_maxdiff :: Double -> Int -> Int
bwa_cal_maxdiff thresh len = k_fin1
where
(k_fin, _, _, _) : _ = dropWhile bad $ iterate step (1,elambda,1,1)
err = 0.02
elambda = exp . negate $ fromIntegral len * err
step (k, s, x, y) = (k+1, s', x', y')
where y' = y * fromIntegral len * err
x' = x * fromIntegral k
s' = s + elambda * y' / x'
bad (_, s, _, _) = 1s >= thresh