{-# LANGUAGE RecordWildCards, NamedFieldPuns, BangPatterns, TypeFamilies #-}
-- Estimates aDNA damage.  Crude first version.
--
-- - Read or subsample a BAM file, make a compact representation of the reads.
-- - Compute the likelihood of each read under a simple model of
--   damage, error/divergence and contamination.
--
-- For the fitting, we simplify radically: ignore sequencing error,
-- assume damage and simple, symmetric substitutions, which subsume error
-- and divergence.
--
-- Trying to compute symbolically is too much, the high power terms get
-- out of hand quickly, and we get mixed powers of \lambda and \kappa.
-- The fastest version so far uses the cheap implementation of automatic
-- differentiation in AD.hs together with the Hager-Zhang method from
-- package nonlinear-optimization.  BFGS from hmatrix-gsl takes longer
-- to converge.  Didn't try an actual Newton iteration (yet?); AD from
-- package ad appears slower.
--
-- If I include parameters whose true value is zero, the transformation
-- to the log-odds-ratio doesn't work, because then the maximum doesn't
-- exist anymore.  For many parameters, zero makes sense, but one
-- doesn't.  A different transformation ('sigmoid2'/'isigmoid2'
-- below) allows for an actual zero (but not an actual one), while
-- avoiding ugly boundary conditions.  That appears to work well.
--
-- The current hack assumes all molecules have an overhang at both ends,
-- then each base gets deaminated with a position dependent probability
-- following a geometric distribution.  If we try to model a fraction of
-- undeaminated molecules (a contaminant) in addition, this fails.  To
-- rescue the idea, I guess we must really decide if the molecule has an
-- overhang at all (probability 1/2) at each end, then deaminate it.
--
-- TODO
--   - needs better output
--   - needs support for multiple input files
--   - needs to deal with long (unmerged) reads (by ignoring them?)

import Bio.Adna hiding ( bang )
import Bio.Bam.Header
import Bio.Bam.Index
import Bio.Bam.Rec
import Bio.Base
import Bio.Genocall.Metadata
import Bio.Iteratee
import Bio.Util.AD
import Bio.Util.AD2
import Bio.Util.Numeric
import Control.Applicative
import Control.Concurrent.Async
import Control.Monad ( unless )
import Data.Bits
import Data.Foldable
import Data.Ix
import Data.Maybe
import Data.String ( fromString )
import Data.Text ( unpack )
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.FilePath
import System.IO ( hPutStrLn, stderr )

import qualified Data.HashMap.Strict as M
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Prelude hiding ( sequence_, mapM, mapM_, concatMap, sum, minimum, foldr1, foldl )

-- | Roughly @Maybe (Nucleotide, Nucleotide)@, encoded compactly
newtype NP = NP { unNP :: Word8 } deriving (Eq, Ord, Ix)

data Seq = Merged  { unSeq :: U.Vector Word8 }
         | Mate1st { unSeq :: U.Vector Word8 }
         | Mate2nd { unSeq :: U.Vector Word8 }

instance Show NP where
    show (NP w) | w == 16   = "NN"
                | w  > 16   = "XX"
                | otherwise = [ "ACGT" !! fromIntegral (w `shiftR` 2)
                              , "ACGT" !! fromIntegral (w .&. 3) ]
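-- A quick illustration of the packing (this helper is documentation
-- only and is not used anywhere else in this module):  the reference
-- base sits in the high two bits, the read base in the low two bits,
-- and anything involving an N falls back to the escape code 16.
-- 'mk_pair' below produces exactly these codes, so a C-to-T
-- substitution, for instance, is @NP 7@.
np_pair :: Char -> Char -> NP
np_pair ref qry = maybe (NP 16) NP $
                  (\r q -> r `shiftL` 2 .|. q) <$> code ref <*> code qry
  where code c = lookup c $ zip "ACGT" [0..]     -- Nothing for Ns, gaps, etc.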
{-# INLINE lk_fun1 #-}
lk_fun1 :: (Num a, Show a, Fractional a, Floating a, Memorable a)
        => Int -> Int -> [a] -> V.Vector Seq -> a
lk_fun1 lmin lmax parms = case length parms of
    -- The number of parameters selects the model:  1 parameter is the
    -- undamaged model (substitution rate only), 4 parameters the double
    -- strand model, 5 parameters the single strand model.
    1 -> V.foldl' (\a b -> a - log (lk tab00 tab00 tab00 b)) 0 . guardV      -- undamaged case
      where
        !tab00 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst 0 0 x | (_,_,x) <- range my_bounds ]

    4 -> V.foldl' (\a b -> a - log (lk tabDS tabDS1 tabDS1 b)) 0 . guardV    -- double strand case
      where
        !tabDS  = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d p_e x
                    | (l,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i)
                    , let p_e = mu $ lambda ^^ (l-i) ]
        !tabDS1 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]

    5 -> V.foldl' (\a b -> a - log (lk tabSS tabSS1 tabSS2 b)) 0 . guardV    -- single strand case
      where
        !tabSS  = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (l,i,x) <- range my_bounds
                    , let lam5 = lambda ^^ (1+i) ; lam3 = kappa ^^ (l-i)
                    , let p_d = mu $ lam3 + lam5 - lam3 * lam5 ]
        !tabSS1 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]
        !tabSS2 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst 0 p_d x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]

    _ -> error "Not supposed to happen: unexpected number of model parameters."
  where
    -- Parameters arrive on the transformed scale and are mapped back
    -- through 'sigmoid2'; missing parameters are simply never forced.
    ~(l_subst : ~(l_sigma : ~(l_delta : ~(l_lam : ~(l_kap : _))))) = parms

    p_subst = 0.33333 * sigmoid2 l_subst
    sigma   = sigmoid2 l_sigma
    delta   = sigmoid2 l_delta
    lambda  = sigmoid2 l_lam
    kappa   = sigmoid2 l_kap

    guardV = V.filter (\u -> U.length (unSeq u) >= lmin && U.length (unSeq u) <= lmax)

    -- Likelihood given a precomputed damage table.  We compute the giant
    -- table ahead of time, which maps length, index and base pair to a
    -- likelihood.
    lk tab_m _ _ (Merged  b) = U.ifoldl' (\a i np -> a * tab_m `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ tab_f _ (Mate1st b) = U.ifoldl' (\a i np -> a * tab_f `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ _ tab_s (Mate2nd b) = U.ifoldl' (\a i np -> a * tab_s `bang` index' my_bounds (U.length b, i, NP np)) 1 b

    index' bnds x | inRange bnds x = index bnds x
                  | otherwise      = error $ "Huh? " ++ show x ++ " \\nin " ++ show bnds

    my_bounds = ((lmin,0,NP 0),(lmax,lmax,NP 16))

    mu p = sigma * p + delta * (1-p)
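-- 'sigmoid2' and its inverse 'isigmoid2' come from Bio.Util.Numeric and
-- are not repeated here.  Purely to illustrate the property described in
-- the header comment (the image contains an actual zero but not an actual
-- one, so no boundary handling is needed), a transformation of that kind
-- could look like the following.  This is a hypothetical sketch, not the
-- library's definition, and it is not used anywhere in this module.
squash_example, unsquash_example :: Double -> Double
squash_example   x = x*x / (1 + x*x)        -- 0 at x == 0, approaches but never reaches 1
unsquash_example p = sqrt (p / (1 - p))     -- inverse on the non-negative branch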
-- Likelihood for a certain pair of bases given error rate, C-T rate
-- and G-A rate.
l_epq :: (Num a, Fractional a, Floating a) => a -> a -> a -> NP -> a
l_epq e p q (NP x) = case x of {
     0 -> s ;  1 -> e         ;  2 -> e          ;  3 -> e ;
     4 -> e ;  5 -> s-p+4*e*p ;  6 -> e          ;  7 -> e+p-4*e*p ;
     8 -> e+q-4*e*q ;  9 -> e ; 10 -> s-q+4*e*q  ; 11 -> e ;
    12 -> e ; 13 -> e         ; 14 -> e          ; 15 -> s ;
     _ -> 1 }
  where s = 1 - 3 * e

lkfun :: Int -> Int -> V.Vector Seq -> U.Vector Double -> Double
lkfun lmin lmax brs parms = lk_fun1 lmin lmax (U.toList parms) brs

lkfun' :: Int -> Int -> V.Vector Seq -> [Double] -> AD
lkfun' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector parms) brs

lkfun'' :: Int -> Int -> V.Vector Seq -> [Double] -> AD2
lkfun'' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector2 parms) brs

combofn :: Int -> Int -> V.Vector Seq -> U.Vector Double -> (Double, U.Vector Double)
combofn lmin lmax brs parms = (x,g)
  where D x g = lk_fun1 lmin lmax (paramVector $ U.toList parms) brs

data Conf = Conf {
    conf_lmin     :: Int,
    conf_metadata :: FilePath,
    conf_report   :: String -> IO (),
    conf_params   :: Parameters }

defaultConf :: Conf
defaultConf = Conf 25 (error "no config file specified") (\_ -> return ()) quietParameters

options :: [OptDescr (Conf -> IO Conf)]
options = [
    Option "m"  ["min-length"]   (ReqArg set_lmin "LEN") "Set minimum length to LEN (25)",
    Option "c"  ["config"]       (ReqArg set_conf "FILE") "Configuration is stored in FILE",
    Option "v"  ["verbose"]      (NoArg set_verbose) "Print progress reports",
    Option "h?" ["help","usage"] (NoArg disp_usage) "Print this message and exit" ]
  where
    set_lmin a c  = readIO a >>= \l -> return $ c { conf_lmin = l }
    set_conf f c  = return $ c { conf_metadata = f }
    set_verbose c = return $ c { conf_report = hPutStrLn stderr, conf_params = debugParameters }

    disp_usage _ = do pn <- getProgName
                      let blah = "Usage: " ++ pn ++ " [OPTION...] [LIBRARY-NAME...]"
                      putStrLn $ usageInfo blah options
                      exitSuccess

main :: IO ()
main = do
    (opts, lnames, errors) <- getOpt Permute options <$> getArgs
    unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure
    conf <- foldl (>>=) (return defaultConf) opts
    mapM_ (main' conf) lnames
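-- The function main' below fits three nested models (single strand,
-- double strand, undamaged) and weighs them by relative likelihood.
-- The inline arithmetic there amounts to a softmax over the minimized
-- negative log-likelihoods; restated as a standalone sketch
-- (hypothetical helper, not used anywhere in this module):
model_weights :: [Double] -> [Double]
model_weights lks = [ exp (mlk - l) / tot | l <- lks ]
  where
    mlk = minimum lks                        -- best value; remember we minimized
    tot = sum [ exp (mlk - l) | l <- lks ]   -- normalizer, so the weights sum to one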
main' :: Conf -> String -> IO ()
main' Conf{..} lname = do
    [Library _ fs _] <- return . filter ((fromString lname ==) . library_name)
                               . concatMap sample_libraries . M.elems
                               =<< readMetadata conf_metadata

    -- XXX meh.  Subsampling from multiple files is not yet supported :(
    brs <- subsampleBam (takeDirectory conf_metadata </> unpack (head fs)) >=> run $ \_ ->
           joinI $ filterStream (\b -> not (isUnmapped (unpackBam b)) &&
                                       G.length (b_seq (unpackBam b)) >= conf_lmin) $
           joinI $ takeStream 100000 $
           joinI $ mapStream pack_record $
           joinI $ filterStream (\u -> U.length (U.filter (<16) (unSeq u)) * 10 >= 9 * U.length (unSeq u)) $
           stream2vectorN 30000

    let lmax  = V.maximum $ V.map (U.length . unSeq) brs
        v0    = crude_estimate brs
        opt v = optimize conf_params 0.0001 v
                         (VFunction $ lkfun conf_lmin lmax brs)
                         (VGradient $ snd . combofn conf_lmin lmax brs)
                         (Just . VCombined $ combofn conf_lmin lmax brs)

    results <- mapConcurrently opt [ v0, U.take 4 v0, U.take 1 v0 ]

    let mlk = minimum [ finalValue st | (_,_,st) <- results ]
        tot = sum [ exp $ mlk - finalValue st | (_,_,st) <- results ]
        p l = exp (mlk - l) / tot

        [ (p_ss, [ _, ssd_sigma_, ssd_delta_, ssd_lambda, ssd_kappa ]),
          (p_ds, [ _, dsd_sigma_, dsd_delta_, dsd_lambda ]),
          (_   , [ _ ]) ] = [ (p (finalValue st), map sigmoid2 $ G.toList xs) | (xs,_,st) <- results ]

        ssd_sigma = p_ss * ssd_sigma_
        ssd_delta = p_ss * ssd_delta_
        dsd_sigma = p_ds * dsd_sigma_
        dsd_delta = p_ds * dsd_delta_

    putStrLn $ "p_{ss} = " ++ show p_ss ++ ", p_{ds} = " ++ show p_ds
    putStrLn $ show DP{..}
    updateMetadata (store_dp lname DP{..}) conf_metadata

    -- Trying to get confidence intervals.  Right now, just get the
    -- gradient and Hessian at the ML point.  The gradient should be nearly
    -- zero, the Hessian should be symmetric and positive definite.
    -- (Remember, we minimized.)
    mapM_ print [ (r,s) | (_,r,s) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'  conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'' conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]

-- We'll require the MD field to be present.  Then we cook each read
-- into a list of paired bases.  Deleted bases are dropped, inserted
-- bases are replaced with an escape code.
--
-- XXX This is annoying... almost, but not quite, the same as the code
-- in the "Pileup" module.  This also relies on MD and doesn't offer the
-- alternative of accessing a reference genome.  (The latter may not be
-- worth the trouble.)  It also resembles the 'ECig' logic from
-- "Bio.Bam.Rmdup".

pack_record :: BamRaw -> Seq
pack_record br = if isReversed b then k (revcom u1) else k u1
  where
    b@BamRec{..} = unpackBam br

    k | isMerged b     = Merged
      | isTrimmed b    = Merged
      | isSecondMate b = Mate2nd
      | otherwise      = Mate1st

    -- reverse-complementing works directly on the packed codes
    revcom = U.reverse . U.map (\x -> if x > 15 then x else xor x 15)

    u1 = U.fromList . map unNP $ go (G.toList b_cigar) (G.toList b_seq) (fromMaybe [] $ getMd b)

    -- Walk CIGAR, SEQ and MD in lockstep and emit one 'NP' per read base.
    go :: [Cigar] -> [Nucleotides] -> [MdOp] -> [NP]
    go (_:*0 :cs) ns            mds  = go cs ns mds
    go cs         ns (MdNum  0: mds) = go cs ns mds
    go cs         ns (MdDel []: mds) = go cs ns mds
    go _          []            _    = []
    go []         _             _    = []

    go (Mat:*nm :cs) (n:ns) (MdNum mm:mds) = mk_pair n n  : go (Mat:*(nm-1):cs) ns (MdNum (mm-1):mds)
    go (Mat:*nm :cs) (n:ns) (MdRep n':mds) = mk_pair n n' : go (Mat:*(nm-1):cs) ns mds
    go (Mat:*nm :cs)    ns  (MdDel _ :mds) =                go (Mat:* nm   :cs) ns mds

    go (Ins:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds
    go (SMa:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds

    go (Del:*nm :cs) ns (MdDel (_:ds):mds) = go (Del:*(nm-1):cs) ns (MdDel ds:mds)
    go (Del:*nm :cs) ns (           _:mds) = go (Del:* nm   :cs) ns mds

    go (_:cs) nd mds = go cs nd mds

esc :: NP
esc = NP 16

mk_pair :: Nucleotides -> Nucleotides -> NP
mk_pair (Ns a) = case a of 1 -> mk_pair' 0
                           2 -> mk_pair' 1
                           4 -> mk_pair' 2
                           8 -> mk_pair' 3
                           _ -> const esc
  where
    mk_pair' u (Ns b) = case b of 1 -> NP $ u .|. 0
                                  2 -> NP $ u .|. 4
                                  4 -> NP $ u .|. 8
                                  8 -> NP $ u .|. 12
                                  _ -> esc
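-- For eyeballing packed reads during debugging:  render one as the list
-- of reference/read pairs it encodes, via the 'Show' instance of 'NP'
-- above.  (Illustrative helper, not wired into any code path.)
show_packed :: Seq -> String
show_packed = unwords . map (show . NP) . U.toList . unSeq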
infix 7 /%/
(/%/) :: Integral a => a -> a -> Double
0 /%/ 0 = 0
a /%/ b = fromIntegral a / fromIntegral b

-- Crude estimate.  Need two overhang lengths, two deamination rates,
-- undamaged fraction, SS/DS, substitution rate.
--
-- DS or SS: look whether CT or GA is greater at the 3' terminal position   √
-- Left overhang length:  ratio of damage at second position to first       √
-- Right overhang length: ratio of CT at last to second-to-last position    √
--                        + ratio of GA at last to second-to-last position  √
-- SS rate: condition on damage at one end, compute rate at the other       √
-- DS rate: condition on damage, compute rate in the interior               √
-- substitution rate: count all substitutions not due to damage             √
-- undamaged fraction: see below                                            √
--
-- Contaminant fraction:  let f5 (f3, f1) be the fraction of reads
-- showing damage at the 5' end (3' end, both ends).  Let a (b) be
-- the probability of an endogenous read showing damage at the 5'
-- end (3' end).  Let e be the fraction of endogenous reads.  Then
-- we have:
--
--      f5 = e * a
--      f3 = e * b
--      f1 = e * a * b
--
--      f5 * f3 / f1 = (e*a) * (e*b) / (e*a*b) = e
--
-- Straightforward and easy to understand, but in practice this method
-- produces ridiculous overestimates, ridiculous underestimates,
-- negative contamination rates, and general grief.  It's actually
-- better to start from a constant number.

crude_estimate :: V.Vector Seq -> U.Vector Double
crude_estimate seqs0 = U.fromList [ l_subst, l_sigma, l_delta, l_lam, l_kap ]
  where
    seqs = V.filter ((>= 10) . U.length) $ V.map unSeq seqs0

    total_equals = V.sum (V.map (U.length . U.filter isNotSubst) seqs)
    total_substs = V.sum (V.map (U.length . U.filter isOrdinarySubst) seqs) * 6 `div` 5
    l_subst = isigmoid2 $ max 0.001 $ total_substs /%/ (total_equals + total_substs)

    c_to_t, g_to_a, c_to_c :: Word8
    c_to_t = 7
    g_to_a = 8
    c_to_c = 5

    isNotSubst      x = x < 16 && x `shiftR` 2 == x .&. 3
    isOrdinarySubst x = x < 16 && x `shiftR` 2 /= x .&. 3 && x /= c_to_t && x /= g_to_a

    ct_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_t && dmg_omega v) seqs
    cc_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_c && dmg_omega v) seqs
    ct_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_t && dmg_omega v) seqs
    cc_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_c && dmg_omega v) seqs

    dmg_omega v = v U.! (l-1) == c_to_t || v U.! (l-1) == g_to_a ||
                  v U.! (l-2) == c_to_t || v U.! (l-2) == g_to_a ||
                  v U.! (l-3) == c_to_t || v U.! (l-3) == g_to_a
      where l = U.length v

    l_lam  = isigmoid2 lambda
    lambda = min 0.9 $ max 0.1 $ (ct_at_beta * (cc_at_alpha + ct_at_alpha)) /%/
                                 ((cc_at_beta + ct_at_beta) * ct_at_alpha)

    ct_at_omega = V.length $ V.filter (\v -> v U.! (U.length v -1) == c_to_t && dmg_alpha v) seqs
    cc_at_omega = V.length $ V.filter (\v -> v U.! (U.length v -1) == c_to_c && dmg_alpha v) seqs
    ct_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v -2) == c_to_t && dmg_alpha v) seqs
    cc_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v -2) == c_to_c && dmg_alpha v) seqs

    dmg_alpha v = v U.! 0 == c_to_t || v U.! 1 == c_to_t || v U.! 2 == c_to_t

    l_kap = isigmoid2 $ min 0.9 $ max 0.1 $ (ct_at_psi * (cc_at_omega+ct_at_omega)) /%/
                                            ((cc_at_psi+ct_at_psi) * ct_at_omega)

    total_inner_CCs = V.sum $ V.map (U.length . U.filter (== c_to_c) . takeInner) seqs
    total_inner_CTs = V.sum $ V.map (U.length . U.filter (== c_to_t) . takeInner) seqs
    takeInner v = U.slice 5 (U.length v - 10) v

    delta    = total_inner_CTs /%/ (total_inner_CTs + total_inner_CCs)
    raw_rate = ct_at_alpha /%/ (ct_at_alpha + cc_at_alpha)

    -- clamping is necessary if f_endo ends up wrong
    l_delta = isigmoid2 $ min 0.99 delta
    l_sigma = isigmoid2 . min 0.99 $ raw_rate / lambda
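-- The overhang length estimate above is just a ratio of two conditional
-- C-to-T rates:  under the geometric overhang model the deamination
-- probability at read position i is roughly proportional to lambda^(i+1),
-- so rate(position 2) / rate(position 1) estimates lambda.  Restated as a
-- standalone sketch (hypothetical helper, not used anywhere;
-- 'crude_estimate' additionally conditions on damage at the far end and
-- clamps the result to [0.1, 0.9], which also guards the degenerate cases):
lambda_from_counts :: Int -> Int -> Int -> Int -> Double
lambda_from_counts ct1 cc1 ct2 cc2 = rate2 / rate1
  where
    rate1 = ct1 /%/ (ct1 + cc1)    -- C-to-T rate at the first position
    rate2 = ct2 /%/ (ct2 + cc2)    -- C-to-T rate at the second position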
-- Memoization of likelihood values in flat unboxed vectors, so the
-- damage tables above work for plain 'Double's as well as for the AD
-- types carrying derivatives.
class Memorable a where
    type Memo a :: *
    fromListN :: Int -> [a] -> Memo a
    bang      :: Memo a -> Int -> a

instance Memorable Double where
    type Memo Double = U.Vector Double
    fromListN = U.fromListN
    bang      = (U.!)

instance Memorable AD where
    -- the Int is the stride:  one value followed by d partial derivatives
    type Memo AD = (Int, U.Vector Double)

    fromListN _ []            = error "unexpected: tried to memorize an empty list"
    fromListN _ (C _ : _)     = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D _ v:_)  = (1+d, U.fromListN (n * (1+d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C a   ) = a : replicate d 0
        unp (D a da) = a : U.toList da

    bang (d, v) i = D (v U.! (d*i+0)) (U.slice (d*i+1) (d-1) v)

instance Memorable AD2 where
    -- the Int is the number of parameters d; each entry occupies
    -- 1 + d + d*d slots:  value, gradient, Hessian
    type Memo AD2 = (Int, U.Vector Double)

    fromListN _ []               = error "unexpected: tried to memorize an empty list"
    fromListN _ (C2 _ : _)       = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D2 _ v _ :_) = (d, U.fromListN (n * (1+d+d*d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C2 a       ) = a : replicate (d+d*d) 0
        unp (D2 a da dda) = a : U.toList da ++ U.toList dda

    bang (d, v) i = D2 (v U.! (stride*i))
                       (U.slice (stride*i+1)   d     v)
                       (U.slice (stride*i+1+d) (d*d) v)
      where stride = 1 + d + d*d

store_dp :: String -> DamageParameters Double -> Metadata -> Metadata
store_dp lname dp = M.map go1
  where
    go1 (Sample ls af bf ts dv) = Sample (map go2 ls) af bf ts dv

    go2 (Library nm fs dmg) | nm == fromString lname = Library nm fs (Just dp)
                            | otherwise              = Library nm fs dmg
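-- Minimal usage sketch for the 'Memorable' machinery above (hypothetical
-- values, not part of the program):  build the flat table once, then index
-- it with 'bang' inside the likelihood fold, as 'lk_fun1' does.
memo_example :: Double
memo_example = tab `bang` 2
  where tab = fromListN 4 [0.1, 0.2, 0.3, 0.4 :: Double]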