{-# LANGUAGE RecordWildCards, NamedFieldPuns, BangPatterns, TypeFamilies #-}
-- Estimates aDNA damage.  Crude first version.
--
-- - Read or subsample a BAM file, make a compact representation of the reads.
-- - Compute the likelihood of each read under a simple model of
--   damage, error/divergence, and contamination.
--
-- For the fitting, we simplify radically: ignore sequencing error,
-- assume damage and simple, symmetric substitutions, which subsume
-- error and divergence.
--
-- Trying to compute symbolically is too much, the high power terms get
-- out of hand quickly, and we get mixed powers of \lambda and \kappa.
-- The fastest version so far uses the cheap implementation of automatic
-- differentiation in AD.hs together with the Hager-Zhang method from
-- package nonlinear-optimization.  BFGS from hmatrix-gsl takes longer
-- to converge.  Didn't try an actual Newton iteration (yet?); AD from
-- package ad appears slower.
--
-- If I include parameters whose true value is zero, the transformation
-- to the log-odds-ratio doesn't work, because then the maximum doesn't
-- exist anymore.  For many parameters, zero makes sense, but one
-- doesn't.  A different transformation ('sigmoid2'/'isigmoid2' below)
-- allows for an actual zero (but not an actual one), while avoiding
-- ugly boundary conditions.  That appears to work well.  (See the
-- illustrative sketch just before 'lk_fun1'.)
--
-- The current hack assumes all molecules have an overhang at both ends,
-- then each base gets deaminated with a position dependent probability
-- following a geometric distribution.  If we try to model a fraction of
-- undeaminated molecules (a contaminant) in addition, this fails.  To
-- rescue the idea, I guess we must really decide if the molecule has an
-- overhang at all (probability 1/2) at each end, then deaminate it.
--
-- TODO
-- - needs better output
-- - needs support for multiple input files
-- - needs to deal with long (unmerged) reads (by ignoring them?)

import Bio.Bam.Header
import Bio.Bam.Index
import Bio.Bam.Rec
import Bio.Base
import Bio.Genocall.Adna
import Bio.Genocall.Metadata
import Bio.Iteratee
import Bio.Util.AD
import Bio.Util.AD2
import Bio.Util.Numeric
import Control.Applicative
import Control.Concurrent.Async
import Control.Monad ( unless, (>=>) )
import Data.Bits
import Data.Foldable
import Data.Ix
import Data.Maybe
import Data.String ( fromString )
import Data.Text ( unpack )
import Data.Word ( Word8 )
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.FilePath
import System.IO ( hPutStrLn, stderr )

import qualified Data.HashMap.Strict as M
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Prelude hiding ( sequence_, mapM, mapM_, concatMap, sum, minimum, foldr1, foldl )

-- | Roughly @Maybe (Nucleotide, Nucleotide)@, encoded compactly
newtype NP = NP { unNP :: Word8 } deriving (Eq, Ord, Ix)

data Seq = Merged  { unSeq :: U.Vector Word8 }
         | Mate1st { unSeq :: U.Vector Word8 }
         | Mate2nd { unSeq :: U.Vector Word8 }

instance Show NP where
    show (NP w) | w == 16   = "NN"
                | w  > 16   = "XX"
                | otherwise = [ "ACGT" !! fromIntegral (w `shiftR` 2)
                              , "ACGT" !! fromIntegral (w .&. 3) ]
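-- Illustrative sketch only, not used anywhere below:  one example of a
-- transformation with the property described in the header comment,
-- namely that 0 is attained at a finite argument while 1 is only
-- approached asymptotically.  The real 'sigmoid2'/'isigmoid2' are
-- imported above and may well be defined differently; the names and
-- definitions here are made up just to make the idea concrete.
sigmoid2_sketch :: Double -> Double
sigmoid2_sketch x = x*x / (1 + x*x)         -- maps the real line onto [0,1)

isigmoid2_sketch :: Double -> Double
isigmoid2_sketch y = sqrt (y / (1-y))       -- partial inverse on [0,1)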
{-# INLINE lk_fun1 #-}
lk_fun1 :: (Num a, Show a, Fractional a, Floating a, Memorable a)
        => Int -> Int -> [a] -> V.Vector Seq -> a
lk_fun1 lmin lmax parms = case length parms of
    1 -> V.foldl' (\a b -> a - log (lk tab00 tab00 tab00 b)) 0 . guardV       -- undamaged case
      where
        !tab00 = fromListN (rangeSize my_bounds)
                 [ l_epq p_subst 0 0 x | (_,_,x) <- range my_bounds ]

    4 -> V.foldl' (\a b -> a - log (lk tabDS tabDS1 tabDS1 b)) 0 . guardV     -- double strand case
      where
        !tabDS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d p_e x
                  | (l,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i)
                  , let p_e = mu $ lambda ^^ (l-i) ]
        !tabDS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    5 -> V.foldl' (\a b -> a - log (lk tabSS tabSS1 tabSS2 b)) 0 . guardV     -- single strand case
      where
        !tabSS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (l,i,x) <- range my_bounds
                  , let lam5 = lambda ^^ (1+i) ; lam3 = kappa ^^ (l-i)
                  , let p_d = mu $ lam3 + lam5 - lam3 * lam5 ]
        !tabSS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]
        !tabSS2 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst 0 p_d x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    _ -> error "Not supposed to happen: unexpected number of model parameters."
  where
    ~(l_subst : ~(l_sigma : ~(l_delta : ~(l_lam : ~(l_kap : _))))) = parms

    p_subst = 0.33333 * sigmoid2 l_subst
    sigma   = sigmoid2 l_sigma
    delta   = sigmoid2 l_delta
    lambda  = sigmoid2 l_lam
    kappa   = sigmoid2 l_kap

    guardV = V.filter (\u -> U.length (unSeq u) >= lmin && U.length (unSeq u) <= lmax)

    -- Likelihood given precomputed damage table.  We compute the giant
    -- table ahead of time, which maps length, index and base pair to a
    -- likelihood.
    lk tab_m _ _ (Merged  b) = U.ifoldl' (\a i np -> a * tab_m `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ tab_f _ (Mate1st b) = U.ifoldl' (\a i np -> a * tab_f `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ _ tab_s (Mate2nd b) = U.ifoldl' (\a i np -> a * tab_s `bang` index' my_bounds (U.length b, i, NP np)) 1 b

    index' bnds x | inRange bnds x = index bnds x
                  | otherwise      = error $ "Huh? " ++ show x ++ " \\nin " ++ show bnds

    my_bounds = ((lmin,0,NP 0),(lmax,lmax,NP 16))

    mu p = sigma * p + delta * (1-p)
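-- Illustrative sketch, again not used by the fit:  the position
-- dependent deamination probability as it enters 'tabDS' above, written
-- out for plain 'Double's.  The probability that the 5' overhang
-- reaches position @i@ decays geometrically with parameter @lambda@;
-- inside the overhang we deaminate at rate @sigma@, outside at rate
-- @delta@ (that is the 'mu' above).  The name and standalone signature
-- are made up for the example; the formula mirrors the code.
p_deam5_sketch :: Double   -- ^ sigma, deamination rate within the overhang
               -> Double   -- ^ delta, deamination rate in the interior
               -> Double   -- ^ lambda, geometric overhang parameter
               -> Int      -- ^ i, 0-based position from the 5' end
               -> Double
p_deam5_sketch sigma delta lambda i = sigma * p + delta * (1-p)
  where p = lambda ^^ (1+i)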
-- Likelihood for a certain pair of bases given error rate, C-T rate
-- and G-A rate.  The code is 4*ref + read:  matches sit on the
-- diagonal, C->C/C->T (codes 5/7) involve the C-T rate p, G->A/G->G
-- (codes 8/10) the G-A rate q, and the escape code contributes a
-- neutral factor of 1.
l_epq :: (Num a, Fractional a, Floating a) => a -> a -> a -> NP -> a
l_epq e p q (NP x) = case x of {
    0  -> s         ;  1  -> e             ;  2  -> e             ;  3  -> e ;
    4  -> e         ;  5  -> s-p+4*e*p     ;  6  -> e             ;  7  -> e+p-4*e*p ;
    8  -> e+q-4*e*q ;  9  -> e             ;  10 -> s-q+4*e*q     ;  11 -> e ;
    12 -> e         ;  13 -> e             ;  14 -> e             ;  15 -> s ;
    _  -> 1 }
  where s = 1 - 3 * e

lkfun :: Int -> Int -> V.Vector Seq -> U.Vector Double -> Double
lkfun lmin lmax brs parms = lk_fun1 lmin lmax (U.toList parms) brs

lkfun' :: Int -> Int -> V.Vector Seq -> [Double] -> AD
lkfun' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector parms) brs

lkfun'' :: Int -> Int -> V.Vector Seq -> [Double] -> AD2
lkfun'' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector2 parms) brs

combofn :: Int -> Int -> V.Vector Seq -> U.Vector Double -> (Double, U.Vector Double)
combofn lmin lmax brs parms = (x,g)
  where D x g = lk_fun1 lmin lmax (paramVector $ U.toList parms) brs

data Conf = Conf {
    conf_lmin     :: Int,
    conf_metadata :: FilePath,
    conf_report   :: String -> IO (),
    conf_params   :: Parameters }

defaultConf :: Conf
defaultConf = Conf 25 (error "no config file specified") (\_ -> return ()) quietParameters

options :: [OptDescr (Conf -> IO Conf)]
options = [
    Option "m"  ["min-length"]   (ReqArg set_lmin "LEN") "Set minimum length to LEN (25)",
    Option "c"  ["config"]       (ReqArg set_conf "FILE") "Configuration is stored in FILE",
    Option "v"  ["verbose"]      (NoArg set_verbose) "Print progress reports",
    Option "h?" ["help","usage"] (NoArg disp_usage) "Print this message and exit" ]
  where
    set_lmin a c = readIO a >>= \l -> return $ c { conf_lmin = l }
    set_conf f c = return $ c { conf_metadata = f }
    set_verbose c = return $ c { conf_report = hPutStrLn stderr, conf_params = debugParameters }

    disp_usage _ = do pn <- getProgName
                      let blah = "Usage: " ++ pn ++ " [OPTION...] [LIBRARY-NAME...]"
                      putStrLn $ usageInfo blah options
                      exitSuccess

main :: IO ()
main = do
    (opts, lnames, errors) <- getOpt Permute options <$> getArgs
    unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure
    conf <- foldl (>>=) (return defaultConf) opts
    mapM_ (main' conf) lnames

main' :: Conf -> String -> IO ()
main' Conf{..} lname = do
    [Library _ fs _] <- return . filter ((fromString lname ==) . library_name)
                               . concatMap sample_libraries . M.elems
                        =<< readMetadata conf_metadata

    -- XXX meh.  subsampling from multiple files is not yet supported :(
    brs <- subsampleBam (takeDirectory conf_metadata </> unpack (head fs)) >=> run $ \_ ->
           joinI $ filterStream (\b -> not (isUnmapped (unpackBam b))
                                       && G.length (b_seq (unpackBam b)) >= conf_lmin) $
           joinI $ takeStream 100000 $
           joinI $ mapStream pack_record $
           joinI $ filterStream (\u -> U.length (U.filter (<16) (unSeq u)) * 10 >= 9 * U.length (unSeq u)) $
           stream2vectorN 30000

    let lmax = V.maximum $ V.map (U.length . unSeq) brs
        v0 = crude_estimate brs
        opt v = optimize conf_params 0.0001 v
                         (VFunction $ lkfun conf_lmin lmax brs)
                         (VGradient $ snd . combofn conf_lmin lmax brs)
                         (Just . VCombined $ combofn conf_lmin lmax brs)

    results <- mapConcurrently opt [ v0, U.take 4 v0, U.take 1 v0 ]

    let mlk = minimum [ finalValue st | (_,_,st) <- results ]
        tot = sum [ exp $ mlk - finalValue st | (_,_,st) <- results ]
        p l = exp (mlk - l) / tot

        [ (p_ss, [ _, ssd_sigma_, ssd_delta_, ssd_lambda, ssd_kappa ]),
          (p_ds, [ _, dsd_sigma_, dsd_delta_, dsd_lambda ]),
          (_   , [ _ ]) ]
              = [ (p (finalValue st), map sigmoid2 $ G.toList xs) | (xs,_,st) <- results ]

        ssd_sigma = p_ss * ssd_sigma_
        ssd_delta = p_ss * ssd_delta_
        dsd_sigma = p_ds * dsd_sigma_
        dsd_delta = p_ds * dsd_delta_

    putStrLn $ "p_{ss} = " ++ show p_ss ++ ", p_{ds} = " ++ show p_ds
    putStrLn $ show DP{..}
    updateMetadata (store_dp lname DP{..}) conf_metadata

    -- Trying to get confidence intervals.  Right now, just get the
    -- gradient and Hessian at the ML point.  Gradient should be nearly
    -- zero, Hessian should be symmetric and positive definite.
    -- (Remember, we minimized.)
    mapM_ print [ (r,s) | (_,r,s) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'  conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'' conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
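-- Small illustration of the model averaging step above (not called
-- anywhere):  the three fits (single strand, double strand, undamaged)
-- are weighted by their likelihoods.  Since 'optimize' minimizes the
-- negative log-likelihood, the weight of a fit with final value @l@ is
-- exp(-l) normalized over all fits; subtracting the minimum first just
-- keeps the exponentials from underflowing.  The name and the bare
-- [Double] interface are made up for the example.
model_weights_sketch :: [Double] -> [Double]
model_weights_sketch ls = [ exp (mlk - l) / tot | l <- ls ]
  where
    mlk = minimum ls
    tot = sum [ exp (mlk - l) | l <- ls ]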
-- We'll require the MD field to be present.  Then we cook each read
-- into a list of paired bases.  Deleted bases are dropped, inserted
-- bases replaced with an escape code.
--
-- XXX This is annoying... almost, but not quite the same as the code
-- in the "Pileup" module.  This also relies on MD and doesn't offer the
-- alternative of accessing a reference genome.  (The latter may not be
-- worth the trouble.)  It also resembles the 'ECig' logic from
-- "Bio.Bam.Rmdup".
pack_record :: BamRaw -> Seq
pack_record br = if isReversed b then k (revcom u1) else k u1
  where
    b@BamRec{..} = unpackBam br

    k | isMerged b     = Merged
      | isTrimmed b    = Merged
      | isSecondMate b = Mate2nd
      | otherwise      = Mate1st

    revcom = U.reverse . U.map (\x -> if x > 15 then x else xor x 15)

    u1 = U.fromList . map unNP $ go (G.toList b_cigar) (G.toList b_seq) (fromMaybe [] $ getMd b)

    go :: [Cigar] -> [Nucleotides] -> [MdOp] -> [NP]
    go (_:*0    :cs) ns            mds  = go cs ns mds
    go           cs  ns (MdNum  0: mds) = go cs ns mds
    go           cs  ns (MdDel []: mds) = go cs ns mds
    go            _  []              _  = []
    go           []   _              _  = []

    go (Mat:*nm :cs) (n:ns) (MdNum mm:mds) = mk_pair n n  : go (Mat:*(nm-1):cs) ns (MdNum (mm-1):mds)
    go (Mat:*nm :cs) (n:ns) (MdRep n':mds) = mk_pair n n' : go (Mat:*(nm-1):cs) ns mds
    go (Mat:*nm :cs)    ns  (MdDel _ :mds) =                go (Mat:* nm   :cs) ns mds

    go (Ins:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds
    go (SMa:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds

    go (Del:*nm :cs) ns (MdDel (_:ds):mds) = go (Del:*(nm-1):cs) ns (MdDel ds:mds)
    go (Del:*nm :cs) ns (            _:mds) = go (Del:* nm   :cs) ns mds

    go (_:cs) nd mds = go cs nd mds

esc :: NP
esc = NP 16

-- Encoding of a base pair:  the first argument (the read base, taken
-- from the sequence) lands in the low two bits, the second (the
-- reference base, from MD) in the high two bits; anything that isn't
-- two proper bases becomes the escape code 16.
mk_pair :: Nucleotides -> Nucleotides -> NP
mk_pair (Ns a) = case a of 1 -> mk_pair' 0
                           2 -> mk_pair' 1
                           4 -> mk_pair' 2
                           8 -> mk_pair' 3
                           _ -> const esc
  where
    mk_pair' u (Ns b) = case b of 1 -> NP $ u .|. 0
                                  2 -> NP $ u .|. 4
                                  4 -> NP $ u .|. 8
                                  8 -> NP $ u .|. 12
                                  _ -> esc

infix 7 /%/
(/%/) :: Integral a => a -> a -> Double
0 /%/ 0 = 0
a /%/ b = fromIntegral a / fromIntegral b
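-- Tiny worked example (not used by the program; the name is made up):
-- a read 'T' over a reference 'C' packs to @NP 7@, which 'show's as
-- "CT" and is exactly the 'c_to_t' code that 'crude_estimate' below
-- counts as damage.  The 'Ns' literals follow the encoding visible in
-- 'mk_pair' itself (A=1, C=2, G=4, T=8).
_np_example :: NP
_np_example = mk_pair (Ns 8) (Ns 2)      -- read T, reference C  ==>  NP 7 ("CT")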
-- Crude estimate.  Need two overhang lengths, two deamination rates,
-- undamaged fraction, SS/DS, substitution rate.
--
-- DS or SS:  look whether CT or GA is greater at 3' terminal position √
-- Left overhang length:  ratio of damage at second position to first √
-- Right overhang length:  ratio of CT at last to snd-to-last posn √
--                         + ratio of GA at last to snd-to-last posn √
-- SS rate:  condition on damage on one end, compute rate at other √
-- DS rate:  condition on damage, compute rate in interior √
-- substitution rate:  count all substitutions not due to damage √
-- undamaged fraction:  see below √
--
-- Contaminant fraction:  let f5 (f3, f1) be the fraction of reads
-- showing damage at the 5' end (3' end, both ends).  Let a (b) be
-- the probability of an endogenous read to show damage at the 5'
-- end (3' end).  Let e be the fraction of endogenous reads.  Then
-- we have:
--
--      f5 = e * a
--      f3 = e * b
--      f1 = e * a * b
--
--      f5 * f3 / f1 = e
--
-- Straightforward and easy to understand, but in practice this method
-- produces ridiculous overestimates, ridiculous underestimates,
-- negative contamination rates, and general grief.  It's actually
-- better to start from a constant number.  (A toy version of the
-- formula follows 'crude_estimate' below, for illustration.)

crude_estimate :: V.Vector Seq -> U.Vector Double
crude_estimate seqs0 = U.fromList [ l_subst, l_sigma, l_delta, l_lam, l_kap ]
  where
    seqs = V.filter ((>= 10) . U.length) $ V.map unSeq seqs0

    total_equals = V.sum (V.map (U.length . U.filter isNotSubst) seqs)
    total_substs = V.sum (V.map (U.length . U.filter isOrdinarySubst) seqs) * 6 `div` 5
    l_subst = isigmoid2 $ max 0.001 $ total_substs /%/ (total_equals + total_substs)

    c_to_t, g_to_a, c_to_c :: Word8
    c_to_t = 7
    g_to_a = 8
    c_to_c = 5

    isNotSubst x = x < 16 && x `shiftR` 2 == x .&. 3
    isOrdinarySubst x = x < 16 && x `shiftR` 2 /= x .&. 3 && x /= c_to_t && x /= g_to_a

    ct_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_t && dmg_omega v) seqs
    cc_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_c && dmg_omega v) seqs
    ct_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_t && dmg_omega v) seqs
    cc_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_c && dmg_omega v) seqs

    dmg_omega v = v U.! (l-1) == c_to_t || v U.! (l-1) == g_to_a ||
                  v U.! (l-2) == c_to_t || v U.! (l-2) == g_to_a ||
                  v U.! (l-3) == c_to_t || v U.! (l-3) == g_to_a
      where l = U.length v

    l_lam = isigmoid2 lambda
    lambda = min 0.9 $ max 0.1 $
             (ct_at_beta * (cc_at_alpha + ct_at_alpha)) /%/ ((cc_at_beta + ct_at_beta) * ct_at_alpha)

    ct_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_t && dmg_alpha v) seqs
    cc_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_c && dmg_alpha v) seqs
    ct_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_t && dmg_alpha v) seqs
    cc_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_c && dmg_alpha v) seqs

    dmg_alpha v = v U.! 0 == c_to_t || v U.! 1 == c_to_t || v U.! 2 == c_to_t

    l_kap = isigmoid2 $ min 0.9 $ max 0.1 $
            (ct_at_psi * (cc_at_omega + ct_at_omega)) /%/ ((cc_at_psi + ct_at_psi) * ct_at_omega)

    total_inner_CCs = V.sum $ V.map (U.length . U.filter (== c_to_c) . takeInner) seqs
    total_inner_CTs = V.sum $ V.map (U.length . U.filter (== c_to_t) . takeInner) seqs
    takeInner v = U.slice 5 (U.length v - 10) v

    delta    = total_inner_CTs /%/ (total_inner_CTs + total_inner_CCs)
    raw_rate = ct_at_alpha /%/ (ct_at_alpha + cc_at_alpha)

    -- clamping is necessary if f_endo ends up wrong
    l_delta = isigmoid2 $ min 0.99 delta
    l_sigma = isigmoid2 . min 0.99 $ raw_rate / lambda
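-- Toy version of the contaminant-fraction formula from the comment
-- above 'crude_estimate' (not used by the program, for the reasons
-- given there).  Solving the little system f5 = e*a, f3 = e*b,
-- f1 = e*a*b also gives the per-end damage probabilities.  All names
-- are made up for the illustration.
endogenous_fraction_sketch :: Double -> Double -> Double -> (Double, Double, Double)
endogenous_fraction_sketch f5 f3 f1 = (e, a, b)
  where
    e = f5 * f3 / f1        -- endogenous fraction
    a = f1 / f3             -- P(damage at 5' end | endogenous)
    b = f1 / f5             -- P(damage at 3' end | endogenous)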
class Memorable a where
    type Memo a :: *
    fromListN :: Int -> [a] -> Memo a
    bang :: Memo a -> Int -> a

instance Memorable Double where
    type Memo Double = U.Vector Double
    fromListN = U.fromListN
    bang = (U.!)

instance Memorable AD where
    -- Layout:  the 'Int' is the stride (1 + number of partials); each
    -- entry stores the value followed by its partial derivatives.
    type Memo AD = (Int, U.Vector Double)

    fromListN _ [          ] = error "unexpected: tried to memorize an empty list"
    fromListN _ (C _    : _) = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D _ v:_) = (1+d, U.fromListN (n * (1+d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C a   ) = a : replicate d 0
        unp (D a da) = a : U.toList da

    bang (d, v) i = D (v U.! (d*i+0)) (U.slice (d*i+1) (d-1) v)

instance Memorable AD2 where
    -- Layout:  here the 'Int' is the number of partials @d@; each entry
    -- has stride 1+d+d*d (value, gradient, Hessian).
    type Memo AD2 = (Int, U.Vector Double)

    fromListN _ [              ] = error "unexpected: tried to memorize an empty list"
    fromListN _ (C2 _       : _) = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D2 _ v _ : _) = (d, U.fromListN (n * (1+d+d*d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C2 a       ) = a : replicate (d+d*d) 0
        unp (D2 a da dda) = a : U.toList da ++ U.toList dda

    bang (d, v) i = D2 (v U.! (stride*i))
                       (U.slice (stride*i+1)   d     v)
                       (U.slice (stride*i+1+d) (d*d) v)
      where stride = 1 + d + d*d

store_dp :: String -> DamageParameters Double -> Metadata -> Metadata
store_dp lname dp = M.map go1
  where
    go1 (Sample ls af bf ts dv) = Sample (map go2 ls) af bf ts dv
    go2 (Library nm fs dmg) | nm == fromString lname = Library nm fs (Just dp)
                            | otherwise              = Library nm fs dmg
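-- Minimal usage sketch for 'Memorable' (name made up, not referenced
-- elsewhere):  over plain 'Double's the memo table is just an unboxed
-- vector, so 'fromListN' followed by 'bang' is ordinary construction
-- and indexing.  The same two calls in 'lk_fun1' transparently switch
-- to the flat (stride, vector) representation when the element type is
-- 'AD' or 'AD2'.
_memo_double_example :: Double
_memo_double_example = tab `bang` 2             -- == 0.3
  where tab = fromListN 4 [0.1, 0.2, 0.3, 0.4 :: Double]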