{-# LANGUAGE RecordWildCards, BangPatterns, OverloadedStrings, FlexibleContexts #-} -- Command line driver for simple genotype calling. We have three -- separate steps: Pileup from a BAM file (or multiple merged files) to -- produce likelihoods (and some auxillary statistics). These are -- written into an Avro container. Next we need to estimate parameters, -- in the simplest case divergence and heterozygosity. We can save some -- time by fusing this with the first step. The final step is calling -- bases by scnaning the Avro container and applying some model, and -- again, in the simplest case that's just divergence and -- heterozygosity. We keep that separate, because different models will -- require different programs. So here we produce likelihoods and -- a simple model fit. -- The likelihoods depend on damage parameters and an error model, -- otherwise they are 'eternal'. (For the time being, it's probably -- wise to go with the naïve error model.) Technically, they also -- depend on ploidy, but since only diploid organisms are interesting -- right now, we fix that to two. We pay some overhead on the sex -- chromosomes, but the simplification is worth it. -- About damage parameters: We effectively have three different models -- (SS, DS, no damage) and it may not be possible to choose one a -- priori. To manage this cleanly, we should have one universal model, -- but the three we have are not generalizations of each other. -- However, all can be generalized into one model with slightly more -- parameters. See tools/dmg-est.hs for how we fit the model. -- Calling is always diploid, for maximum flexibility. We don't really -- support higher ploidies, so the worst damage is that we output an -- overhead of 150% useless likelihood values for the sex chromosomes -- and maybe estimate heterozygosity where there is none. import Bio.Base import Bio.Bam.Header import Bio.Bam.Index import Bio.Bam.Pileup import Bio.Bam.Reader import Bio.Bam.Rec import Bio.Genocall import Bio.Genocall.Adna import Bio.Genocall.AvroFile import Bio.Genocall.Metadata import Bio.Iteratee import Control.Applicative import Control.Monad import Data.Aeson import Data.Avro import Data.String ( fromString ) import Data.Vec.Packed ( packMat ) import System.Console.GetOpt import System.Directory ( renameFile ) import System.Environment import System.Exit import System.FilePath import System.IO import qualified Data.ByteString.Char8 as S import qualified Data.ByteString.Lazy as BL import qualified Data.Foldable as F import qualified Data.HashMap.Strict as H import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Vector.Storable as VS import qualified Data.Vector as V import qualified Data.Vector.Unboxed as U import qualified Data.Vector.Unboxed.Mutable as M import qualified Data.Sequence as Z data Conf = Conf { -- Generator for output file name. Receives sample name and -- (optional) region as arguments. conf_output :: String -> Maybe String -> FilePath, conf_metadata :: FilePath, conf_theta :: Maybe Double, conf_report :: String -> IO () } defaultConf :: Conf defaultConf = Conf default_out (error "no metadata file specified") Nothing (\_ -> return ()) where default_out smp Nothing = smp <> ".av" default_out smp (Just rgn) = smp <> "-" <> rgn <> ".av" options :: [OptDescr (Conf -> IO Conf)] options = [ Option "c" ["config"] (ReqArg set_conf "FILE") "Set name of json config file to FILE", Option "o" ["output"] (ReqArg set_output "FILE") "Set out file schema to FILE", Option "t" dep_param (ReqArg set_theta "FRAC") "Set dependency coefficient to FRAC (\"N\" to turn off)", Option "v" ["verbose"] (NoArg be_verbose) "Print more diagnostics", Option "h?" ["help","usage"] (NoArg disp_usage) "Display this message" ] where dep_param = ["theta","dependency-coefficient"] disp_usage _ = do pn <- getProgName let blah = "Usage: " ++ pn ++ " [OPTION...] [SAMPLE [REGION...] ...]" putStrLn $ usageInfo blah options exitSuccess be_verbose c = return $ c { conf_report = hPutStrLn stderr } set_conf fn c = return $ c { conf_metadata = fn } set_theta "N" c = return $ c { conf_theta = Nothing } set_theta a c = (\t -> c { conf_theta = Just t }) <$> readIO a set_output fn c = return $ c { conf_output = mkoutput fn } mkoutput :: FilePath -> String -> Maybe String -> FilePath mkoutput str smp rgn = go str where go ('%':'s':s) = smp ++ go s go ('%':'r':s) = maybe id (++) rgn $ go s go ('%':'%':s) = '%' : go s go ('%': c :s) = c : go s go ( c :s) = c : go s go [ ] = [ ] main :: IO () main = do (opts, samprgns, errs) <- getOpt Permute options <$> getArgs Conf{..} <- foldl (>>=) (return defaultConf) opts unless (null errs) $ mapM_ (hPutStrLn stderr) errs >> exitFailure -- samprgns contains samples and regions. We define anything found in the metadata as sample, -- anything else as region to apply to the previous sample. Name a sample "chr1" and you get -- what you deserve. samples <- flip split_sam_rgns samprgns <$> readMetadata conf_metadata when (null samples) $ hPutStrLn stderr "need (at least) one sample name" >> exitFailure forM_ samples $ \(sample, rgns) -> do meta <- readMetadata conf_metadata case H.lookup (fromString sample) meta of Nothing -> hPutStrLn stderr $ "unknown sample " ++ show sample Just smp -> forM_ rgns $ \rgn -> do let outstem = conf_output sample rgn outfile = takeDirectory conf_metadata outstem tmpfile = outfile ++ ".#" (tab,()) <- withFile tmpfile WriteMode $ \ohdl -> mergeLibraries conf_report conf_metadata (sample_libraries smp) rgn >=> run $ \hdr -> progressPos (\(rs, p, _) -> (rs, p)) "GT call at " conf_report (meta_refs hdr) =$ pileup =$ mapStream (calls conf_theta) =$ zipStreams tabulateSingle (output_avro ohdl $ meta_refs hdr) let upd_sample s = s { sample_div_tables = H.insert (maybe T.empty T.pack rgn) tab (sample_div_tables s) , sample_avro_files = H.insert (maybe T.empty T.pack rgn) (fromString outstem) (sample_avro_files s) } updateMetadata (H.adjust upd_sample (fromString sample)) conf_metadata renameFile tmpfile outfile mergeLibraries :: (MonadIO m, MonadMask m) => (String -> IO ()) -> FilePath -> [Library] -> Maybe String -> Enumerator' BamMeta [PosPrimChunks] m b mergeLibraries report cfg [ l ] mrgn = enumLibrary report cfg l mrgn mergeLibraries report cfg (l:ls) mrgn = mergeEnums' (mergeLibraries report cfg ls mrgn) (enumLibrary report cfg l mrgn) mm where mm _ = mergeSortStreams $ \(rs1, p1, _) (rs2, p2, _) -> if (rs1, p1) < (rs2, p2) then Less else NotLess enumLibrary :: (MonadIO m, MonadMask m) => (String -> IO ()) -> FilePath -> Library -> Maybe String -> Enumerator' BamMeta [PosPrimChunks] m b enumLibrary report cfg (Library nm fs mdp) mrgn output = do let (msg, dmg) = case mdp of Nothing -> ("no damage model", noDamage) Just dp -> ("universal damage parameters" ++ show dp, univDamage dp) liftIO . report $ "using " ++ msg ++ " for " ++ T.unpack nm mergeInputRgns mrgn combineCoordinates (map (() (takeDirectory cfg) . T.unpack) fs) $== takeWhileE (isValidRefseq . b_rname . unpackBam) $== mapMaybeStream (\br -> let b = unpackBam br m = dmg (isReversed b) (VS.length (b_qual b)) in decompose (map packMat $ V.toList m) br) $ output mergeInputRgns :: (MonadIO m, MonadMask m) => Maybe String -> (BamMeta -> Enumeratee [BamRaw] [BamRaw] (Iteratee [BamRaw] m) a) -> [FilePath] -> Enumerator' BamMeta [BamRaw] m a mergeInputRgns _ _ [ ] = \k -> return (k mempty) mergeInputRgns Nothing (?) fps = mergeInputs (?) fps mergeInputRgns (Just rs) (?) (fp0:fps0) = go fp0 fps0 where enum1 fp k1 = do idx <- liftIO $ readBamIndex fp enumFileRandom defaultBufSize fp >=> run >=> run $ decodeAnyBam $ \hdr -> let Just ri = Z.findIndexL ((==) rs . unpackSeqid . sq_name) (meta_refs hdr) in eneeBamRefseq idx (Refseq $ fromIntegral ri) $ k1 hdr go fp [ ] = enum1 fp go fp (fp1:fps) = mergeEnums' (go fp1 fps) (enum1 fp) (?) -- | Ploidy is hardcoded as two here. Can be changed if the need -- arises. -- -- XXX For the time being, forward and reverse piles get concatenated. -- For the naive call, this doesn't matter. For the MAQ call, it feels -- more correct to treat them separately and multiply (add?) the results. calls :: Maybe Double -> Pile -> Calls calls Nothing pile = pile { p_snp_pile = s, p_indel_pile = i } where !s = simple_snp_call fq 2 $ uncurry (++) $ p_snp_pile pile !i = simple_indel_call 2 $ p_indel_pile pile -- XXX this should be a cmdline option -- fq = min 1 . (*) 1.333 . fromQual fq = fromQual calls (Just theta) pile = pile { p_snp_pile = s, p_indel_pile = i } where !i = simple_indel_call 2 $ p_indel_pile pile -- This lumps the two strands together -- !s = maq_snp_call 2 theta $ uncurry (++) $ p_snp_pile pile -- XXX -- This treats them separately !s | r == r' = Snp_GLs (U.zipWith (*) x y) r -- same ref base (normal case): multiply | r == nucsN = Snp_GLs y r' -- forward ref is N, use backward call | otherwise = Snp_GLs x r -- else use forward call (even if this is incorrect, where -- there is nothing else we can do here) Snp_GLs x r = maq_snp_call 2 theta $ fst $ p_snp_pile pile Snp_GLs y r' = maq_snp_call 2 theta $ snd $ p_snp_pile pile -- | Serialize the results from genotype calling in a sensible way. We -- write an Avro file, but we add another blocking layer on top so we -- don't need to endlessly repeat coordinates. compileBlocks :: Monad m => Enumeratee [Calls] [GenoCallBlock] m a compileBlocks = convStream $ do c1 <- headStream tailBlock (p_refseq c1) (p_pos c1) (p_pos c1) . (:[]) $! pack c1 where tailBlock !rs !p0 !po acc = do mc <- peekStream case mc of Just c1 | rs == p_refseq c1 && po+1 == p_pos c1 && po - p0 < 65536 -> do _ <- headStream tailBlock rs p0 (po+1) . (:acc) $! pack c1 _ -> return [ GenoCallBlock { reference_name = rs , start_position = p0 , called_sites = reverse acc } ] pack c1 = rlist indel_variants `seq` GenoCallSite{..} where Snp_GLs snp_pls !ref_allele = p_snp_pile c1 !snp_stats = p_snp_stat c1 !indel_stats = p_indel_stat c1 !snp_likelihoods = compact_likelihoods snp_pls !indel_likelihoods = compact_likelihoods $ fst $ p_indel_pile c1 !indel_variants = snd $ p_indel_pile c1 rlist [] = () rlist (x:xs) = x `seq` rlist xs output_avro :: Handle -> Refs -> Iteratee [Calls] IO () output_avro hdl refs = compileBlocks =$ writeAvroContainer ContainerOpts{..} =$ mapChunksM_ (S.hPut hdl) where objects_per_block = 16 filetype_label = "Genotype Likelihoods V0.1" initial_schemas = H.singleton "Refseq" $ object [ "type" .= String "enum" , "name" .= String "Refseq" , "symbols" .= Array (V.fromList . map (String . T.decodeUtf8 . sq_name) $ F.toList refs) ] meta_info = H.singleton "biohazard.refseq_length" $ S.concat $ BL.toChunks $ encode $ Array $ V.fromList [ Number (fromIntegral (sq_length s)) | s <- F.toList refs ] maxD :: Int maxD = 64 -- | Parameter estimation for a single sample. The parameters are -- divergence and heterozygosity. We tabulate the data here and do the -- estimation afterwards. Returns the product of the -- parameter-independent parts of the likehoods and the histogram -- indexed by D and H (see @genotyping.pdf@ for details). tabulateSingle :: (Functor m, MonadIO m) => Iteratee [Calls] m (Double, U.Vector Int) tabulateSingle = do tab <- liftIO $ M.replicate (12 * maxD * maxD) (0 :: Int) (,) <$> foldStreamM (\acc -> accum tab acc . p_snp_pile) (0 :: Double) <*> liftIO (U.unsafeFreeze tab) where -- We need GL values for the invariant, the three homozygous variant -- and the three single-event heterozygous variant cases. The -- ordering is like in BCF, with the reference first. -- Ref ~ A ==> PL ~ AA, AC, CC, AG, CG, GG, AT, CT, GT, TT {-# INLINE accum #-} accum !tab !acc (Snp_GLs !gls !ref) | U.length gls /= 10 = error "Ten GL values expected for SNP!" -- should not happen | ref `elem` [nucsC,nucsG] = accum' 0 tab acc gls | ref `elem` [nucsA,nucsT] = accum' 6 tab acc gls | otherwise = return acc -- unknown reference -- The simple 2D table didn't work, it lacked resolution in some -- cases. We make six separate tables instead so we can store two -- differences with good resolution in every case. {-# INLINE accum' #-} accum' refix !tab !acc !gls | g_RR >= g_RA && g_RA >= g_AA = store 0 g_RR g_RA g_AA | g_RR >= g_AA && g_AA >= g_RA = store 1 g_RR g_AA g_RA | g_RA >= g_RR && g_RR >= g_AA = store 2 g_RA g_RR g_AA | g_RA >= g_AA && g_AA >= g_RR = store 3 g_RA g_AA g_RR | g_RR >= g_RA = store 4 g_AA g_RR g_RA | otherwise = store 5 g_AA g_RA g_RR where g_RR = unPr $ U.unsafeIndex gls 0 g_RA = unPr $ (U.unsafeIndex gls 1 + U.unsafeIndex gls 3 + U.unsafeIndex gls 6) / 3 g_AA = unPr $ (U.unsafeIndex gls 2 + U.unsafeIndex gls 5 + U.unsafeIndex gls 9) / 3 store t a b c = do let d1 = min (maxD-1) . round $ a - b d2 = min (maxD-1) . round $ b - c ix = (t + refix) * maxD * maxD + d1 * maxD + d2 liftIO $ M.read tab ix >>= M.write tab ix . succ return $! acc + a