{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards #-} -- Simple Mitochondrial Contamination Check on BAM files. -- -- This is based on Ye Olde Contamination Check for the Neanderthal -- genome; the method is the same (and will continue to not work on -- modern humans), but simplified and sanitized. Differences from -- before: -- -- * We use the alignment from the BAM file as is. Earlier we would -- have created *two* new alignments. That is silly, however. Two -- new alignments should not be followed by bean counting, but by an -- attempt to genotype both the sample and the contaminant. -- -- * Before, the sample and contaminant sequences were fixed. Now we -- instead input a list of the diagnostic positions. Instead of an -- explicit list, the two sequences can still be used, or only the -- contaminant can be supplied while the sample is genotype-called. -- TODO -- -- (1) Given a list of diagnostic positions, implement the contamination -- check. Structure of the code can be stolen from ccheck in the -- mia package. -- -- - What do we do about wrapped alignments? Mia has f/b/a labels, -- BAM doesn't. We can see if it overhangs, though. -- -- (2) Given a high-coverage sample, genotype call it and derive the -- diagnostic positions. -- -- - This method needs some definition of the contaminant consensus -- thingy. -- -- (3) Given a `correct' sample sequence, align it to the reference and -- derive diagnostic positions from that. -- -- - Needs the same description of the contaminant thingy. -- -- (4) Consider Read Groups. -- -- - One result per read group (or maybe per library, alternatively -- per file) should be produced. -- - The "aDNA" setting should be determined from either the @RG -- header or from an external source. import Bio.Base import Bio.Bam hiding ( Unknown ) import Control.Applicative import Control.Monad import Data.Bits import Data.Monoid import Data.List import Numeric import System.Console.GetOpt import System.Environment import System.Exit import System.IO import qualified Data.HashMap.Strict as HM import qualified Data.IntMap as IM data Conf = Conf { conf_adna :: Adna, conf_verbosity :: Int, conf_header :: HeaderFn, conf_output :: OutputFn, conf_shoot_foot :: Bool, conf_dp_list :: DpList } options :: [OptDescr (Conf -> IO Conf)] options = public_options ++ hidden_options where public_options = [ Option "a" ["ancient","dsprot"] (NoArg (set_adna ancientDNAds)) "Treat DNA as ancient, double strand protocol", Option "s" ["ssprot"] (NoArg (set_adna ancientDNAss)) "Treat DNA as ancient, single strand protocol", Option "" ["fresh"] (NoArg (set_adna freshDNA)) "Treat DNA as fresh (not ancient)", Option "T" ["table"] (NoArg set_output_table) "Print output in table form", Option "v" ["verbose"] (NoArg (mod_verbosity succ)) "Produce more debug output", Option "q" ["quiet"] (NoArg (mod_verbosity pred)) "Produce less debug output", Option "h?" ["help","usage"] (NoArg usage) "Print this message and exit" ] hidden_options = [ Option "" ["shoot","foot"] (NoArg set_shoot_foot) [] ] usage _ = do pn <- getProgName hPutStrLn stderr $ usageInfo ("Usage: " ++ pn ++ " [OPTION...] [Bam-File...]") public_options exitSuccess set_shoot_foot c = return $ c { conf_shoot_foot = True } set_adna a c = return $ c { conf_adna = a } set_output_table c = return $ c { conf_output = show_result_table, conf_header = header_table } mod_verbosity f c = return $ c { conf_verbosity = f (conf_verbosity c) } conf0 :: IO Conf conf0 = return $ Conf { conf_adna = freshDNA , conf_verbosity = 1 , conf_header = "" , conf_output = show_result_plain , conf_shoot_foot = False , conf_dp_list = error "no diagnostic positions defined" } {- Old options... may or may not be of much use. struct option longopts[] = { { "reference", required_argument, 0, 'r' }, { "transversions", no_argument, 0, 't' }, { "span", required_argument, 0, 's' }, { "maxd", required_argument, 0, 'd' }, } ; void usage( const char* pname ) { "Reads a maln file and tries to quantify contained contamination.\n" "Options:\n" " -r, --reference FILE FASTA file with the likely contaminant (default: builtin mt311)\n" " -t, --transversions Treat only transversions as diagnostic\n" " -s, --span M-N Look only at range from M to N\n" " -n, --numpos N Require N diagnostic sites in a single read (default: 1)\n" } -} -- | A list of diagnostic positions. We drop the failed idea of -- "weakly diagnostic positions". We also work in the coordinate system -- of the reference. Therefore, a diagnostic position is defined by -- position, allele in the clean sample and allele in the contaminant. data Dp = Dp { _dp_clean_allele :: !Nucleotide , _dp_dirty_allele :: !Nucleotide } deriving Show type DpList = IM.IntMap Dp show_dp_list :: DpList -> ShowS show_dp_list = flip IM.foldrWithKey id $ \pos (Dp cln drt) k -> (:) '<' . shows pos . (:) ':' . shows drt . (:) ',' . shows cln . (++) ">, " . k -- | Reads are classified into one of these. data Klass = Unknown | Clean | Dirty | Conflict | Nonsense deriving (Ord, Eq, Enum, Bounded, Show) instance Monoid Klass where mempty = Unknown Clean `mappend` Dirty = Conflict Dirty `mappend` Clean = Conflict x `mappend` y = if x < y then y else x newtype Summary = Summary (IM.IntMap Int) sum_count :: Klass -> Summary -> Summary sum_count kl (Summary m) = Summary $ IM.insertWith' (+) (fromEnum kl) 1 m sum_get :: Klass -> Summary -> Int sum_get kl (Summary m) = IM.findWithDefault 0 (fromEnum kl) m -- | Determines what an allele could come from. Does not take -- port-mortem modifications into account. classify :: Dp -> Nucleotide -> Klass classify (Dp cln drt) nuc | maybe_clean && maybe_dirty = Unknown | maybe_clean = Clean | maybe_dirty = Dirty | otherwise = Nonsense where maybe_clean = unN cln .&. unN nuc /= 0 maybe_dirty = unN drt .&. unN nuc /= 0 -- | We deal with aDNA by transforming a base into all the bases it -- could have been. So the configuration is simply the transformation -- function. type Adna = Nucleotide -> Nucleotide -- | Fresh DNA: no transformation. freshDNA :: Adna freshDNA = id -- | Ancient DNA, single strand protocol. Deamination can turn C into T -- only. ancientDNAss :: Adna ancientDNAss = N . app . unN where app x = if x .&. unN nucT /= 0 then x .|. unN nucC else x -- | Ancient DNA, double strand protocol. Deamination can turn C into T -- and G into A. ancientDNAds :: Adna ancientDNAds = N . app1 . app2 . unN where app1 x = if x .&. unN nucT /= 0 then x .|. unN nucC else x app2 x = if x .&. unN nucA /= 0 then x .|. unN nucG else x -- | Classifying a read. In an ideal world, we'd be looking at a single -- read mapped in one piece. Instead, we may be looking at half a mate -- pair or even a single read mapped inconveniently across the origin. -- -- We will be reading a BAM stream. All reads with the same name (there -- maybe 1..4, assuming no major breakage) need to be processed -- together. We'll isolate that here: our input stream consists of -- reads that all have the same qname. Results in exactly one 'Klass'. -- We will ignore mate pairs that are improperly mapped or filtered. -- -- May need more options. Note that application of the aDNA function -- depends on the strandedness of the alignment. FIXME -- -- This is the only place where counting of votes was used before, and -- only for debugging purposes. Everything that was either dirty or -- clean (but not both) counted as a vote. classify_read_set :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Klass classify_read_set = undefined -- | Classifying a stream. We create a map from read name to iteratee. -- New names are inserted, known names fed to stored iteratees. -- ``Done'' iteratees are disposed of immediately. classify_stream :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Summary classify_stream dps adna = foldStreamM classify_read (Summary IM.empty, HM.empty) >>= lift . finish where classify0 = classify_read_set dps adna classify_read (summary, iters) rd = do let nm = b_qname $ unpackBam rd let it = HM.lookupDefault classify0 nm iters (isdone, it') <- enumPure1Chunk [rd] it >>= enumCheckIfDone if isdone then do cl <- run it' return (sum_count cl summary, HM.delete nm iters) else return (summary, HM.insert nm it' iters) finish (summary, iters) = foldM (\s it -> flip sum_count s `liftM` run it) summary $ HM.elems iters {- Missing from the output right now: * filename (library would be better) * alignment distance (only useful if DPs are derived from alignment) * number of difference (likewise) * number of DPs * number of DPs which are transversions -} result_labels :: [ String ] result_labels = [ "unclassified", "clean", "polluting", "conflicting", "nonsensical", "LB", "ML", "UB" ] type HeaderFn = String type OutputFn = Summary -> Maybe [Double] -> String show_result_plain :: OutputFn show_result_plain summary ests = unlines $ zipWith fmt result_labels [minBound..maxBound] ++ [[]] where labellen = (+) 2 . maximum . map length $ zipWith const result_labels [minBound..maxBound::Klass] pad n s = replicate (n - length s) ' ' ++ s fmt lbl kl = pad labellen lbl ++ " fragments: " ++ show (sum_get kl summary) ++ if kl == Dirty then maybe [] fmt_ests ests else [] fmt_ests [lb,ml,ub] = " (" ++ showFFloat (Just 1) lb " .. " ++ showFFloat (Just 1) ml " .. " ++ showFFloat (Just 1) ub "%)" header_table :: HeaderFn header_table = intercalate "\t" result_labels show_result_table :: OutputFn show_result_table summary ests = intercalate "\t" $ [ show $ sum_get kl summary | kl <- [minBound..maxBound] ] ++ maybe (replicate 3 "N/A") (map (\x -> showFFloat (Just 1) x [])) ests show_result_with :: (Summary -> Maybe [Double] -> a) -> Summary -> a show_result_with f summary = f summary (if nn /= 0 then Just [lb,ml,ub] else Nothing) where z = 1.96 -- this is Z_{0.975}, giving a 95% confidence interval k = fromIntegral (sum_get Dirty summary) n = k + fromIntegral (sum_get Clean summary) nn = sum_get Dirty summary + sum_get Clean summary p_ = k / n c = p_ + 0.5 * z * z / n w = z * sqrt( p_ * (1-p_) / n + 0.25 * z * z / (n*n) ) d = 1 + z * z / n lb = max 0 $ 100 * (c-w) / d -- lower bound of CI ml = 100 * p_ -- ML estimate ub = min 100 $ 100 * (c+w) / d -- upper bound of CI -- The following is old 'ccheck'... for reference and guidance. {- /* * Contamination Checker. Outline: * * - read the human reference (concsensus of contaminants); this will * contain ambiguity codes * - read maln file, including assembly and assembled reads * - align contaminant-consensus and assembly globally * This uses Myers' O(nd) aligner, for it grasps ambiguity codes and * runs fast enough, in little memory, for long, but similar * sequences. * - find "strongly diagnostic positions", positions where ass and con * are incompatible, and "weakly diagnostic positions", positions * where ass and con are not always equal * - for every "end" fragment: store it and later join with its other * half to give an effectively "full" fragment * - for every "full" fragment: if it crosses at least one (strongly or * weakly) diagnostic position, cut out that range from ref and align * to it globally using the mia aligner * - pass 1: for every weakly diagnostic position where the bases agree, * store whether a contaminant was discovered, and if so, turn them * into "actually diagnostic positions". * - pass 2: for every (strongly or actually) diagnostic position where * the bases agree, classify it, then classify the fragment * (conflicting, uninformative, contaminant, endogenous) * - produce a summary * * Notable features: * - operates sensibly on aDNA * - has sensible commandline and doesn't make too much noise in operation * - optionally considers only certain diagnostic positions * (tranversions only and/or some region only) * - new consensus sequence has other letters besides N */ // Everything that differs is weakly diagnostic, unless it's a gap. // Note that this mean that Ns are usually weakly diagnostic. bool is_diagnostic( char aln1, char aln2 ) { return aln1 != '-' && aln2 != '-' && toupper(aln1) != toupper(aln2) ; } // Interesting question... given ambiguity codes, what's a transversion? // One way to put it: anything that is incompatible with all four // transitions. Needs a different implementation. bool is_transversion( char a, char b ) { char u = a & ~32 ; char v = b & ~32 ; switch( u ) { case 'A': return v != 'G' ; case 'C': return v != 'T' ; case 'G': return v != 'A' ; case 'T': case 'U': return v != 'C' ; default: return false ; } } dp_list mk_dp_list( const char* aln1, const char* aln2, int span_from, int span_to ) { dp_list l ; int index = 0 ; while( index != span_from && *aln1 && *aln2 ) { if( *aln2 != '-' ) ++index ; ++aln1 ; ++aln2 ; } while( index != span_to && *aln1 && *aln2 ) { if( is_diagnostic( *aln1, *aln2 ) ) { l[index].consensus = *aln1 ; l[index].assembly = *aln2 ; } if( *aln2 != '-' ) ++index ; ++aln1 ; ++aln2 ; } return l ; } -} -- We won't keep this. Mt311 should be stored as half a Dp list. -- extern char mt311_sequence[] ; -- extern const int mt311_sequence_size ; main :: IO () main = do (opts, files, errors) <- getOpt Permute options <$> getArgs unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure Conf{..} <- foldl (>>=) conf0 opts {- bool transversions = false ; int min_diag_posns = 1 ; int maxd = 0 ; int span_from = 0, span_to = INT_MAX ; int opt ; do { opt = getopt_long( argc, argv, "r:avhts:d:n:MfTF", longopts, 0 ) ; switch( opt ) { case 'r': read_fasta_ref( &hum_ref, optarg ) ; break ; case 't': transversions = true ; break ; case 's': sscanf( optarg, "%u-%u", &span_from, &span_to ) ; if( span_from ) span_from-- ; break ; case 'n': min_diag_posns = atoi( optarg ) ; break ; case 'd': maxd = atoi( optarg ) ; break ; } } while( opt != -1 ) ; -} when (IM.size conf_dp_list < 40 && not conf_shoot_foot) $ do hPutStrLn stderr $ "\n *** Low number (" ++ shows (IM.size conf_dp_list) ") of diagnostic positions found.\n\ \ *** I will stop now for your own safety.\n\ \ *** If you are sure you want to shoot yourself\n\ \ *** in the foot, read the man page to learn\n\ \ *** how to lift this restriction.\n\n" exitFailure -- TODO We will usually want to seek to the mitochondrion, which -- doesn't work with the simple 'mergeInputs' invocation. r <- mergeInputs combineCoordinates files >=> run $ \hdr -> classify_stream conf_dp_list conf_adna putStrLn $ unlines $ conf_header : show_result_with conf_output r : [] {- if( mktable ) { fputs( infile.c_str(), stdout ) ; putchar( '\t' ) ; } else { puts( infile.c_str() ) ; putchar( '\n' ) ; } -} -- if( !maxd ) maxd = max( strlen(hum_ref.seq), strlen(maln->ref->seq) ) / 10 ; -- char *aln_con = (char*)malloc( strlen(hum_ref.seq) + maxd + 2 ) ; -- char *aln_ass = (char*)malloc( strlen(maln->ref->seq) + maxd + 2 ) ; -- unsigned d = myers_diff( hum_ref.seq, myers_align_globally, maln->ref->seq, maxd, aln_con, aln_ass ) ; {- if( d == UINT_MAX ) { fprintf( stderr, "\n *** Could not align references with up to %d mismatches.\n" " *** This is usually a sign of trouble, but\n" " *** IF AND ONLY IF YOU KNOW WHAT YOU ARE DOING, you can\n" " *** try the -d N option with N > %d.\n\n", maxd, maxd ) ; return 1 ; } if( mktable ) printf( "%d\t", d ) ; else printf( " %d alignment distance between reference and assembly.\n", d ) ; if( verbose >= 6 ) print_aln( aln_con, aln_ass ) ; dp_list l = mk_dp_list( aln_con, aln_ass, span_from, span_to ) ; if( mktable ) printf( "%u\t", (unsigned)l.size() ) ; else printf( " %u total differences between reference and assembly.\n", (unsigned)l.size() ) ; int num_strong = 0 ; for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i ) if( i->second.strength > weak ) ++num_strong ; if( mktable ) printf( "%d\t", (int)l.size() ) ; else { printf( " %d diagnostic positions", (int)l.size() ) ; if( span_from != 0 || span_to != INT_MAX ) printf( " in range [%d,%d)", span_from, span_to ) ; printf( ", %d of which are strongly diagnostic.\n", num_strong ) ; } if( verbose >= 3 ) { print_dp_list( stderr, l.begin(), l.end(), '\n', 0 ) ; print_dp_list( stderr, l.begin(), l.end(), '\n', 1 ) ; } -} {- if( verbose >= 2 ) fputs( "Pass one: finding actually diagnostic positions.\n", stderr ) ; for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s ) { fixup_name( *s ) ; std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ; // are we overlapping anything at all? std::pair< dp_list::const_iterator, dp_list::const_iterator > p = overlapped_diagnostic_positions( l, *s ) ; if( verbose >= 3 ) { fprintf( stderr, "%s/%c:\n %d potentially diagnostic positions", (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ; if( verbose >= 4 ) { putc( ':', stderr ) ; putc( ' ', stderr ) ; print_dp_list( stderr, p.first, p.second, 0 ) ; } fprintf( stderr, "; range: %d..%d\n", (*s)->start, (*s)->end ) ; } -} {- int t = 0 ; for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i ) if( is_transversion( i->second.consensus, i->second.assembly ) ) ++t ; if( mktable ) printf( "%d\t%d\t", t, num_strong ) ; else { printf( " %d effectively diagnostic positions", (int)l.size() ) ; if( span_from != 0 || span_to != INT_MAX ) printf( " in range [%d,%d)", span_from, span_to ) ; printf( ", %d of which are transversions.\n\n", t ) ; } if( verbose >= 3 ) print_dp_list( stderr, l.begin(), l.end(), '\n' ) ; std::deque< cached_pwaln >::const_iterator cpwaln = cached_pwalns.begin() ; for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s, ++cpwaln ) { whatsit klass = unknown ; int votes = 0, votes2 = 0 ; std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ; // enough overlap? (we only have _actually_ diagnostic positions now) std::pair< dp_list::const_iterator, dp_list::const_iterator > p = overlapped_diagnostic_positions( l, *s ) ; if( std::distance( p.first, p.second ) < min_diag_posns ) { if( verbose >= 3 ) { fputs( (*s)->id, stderr ) ; putc( '/', stderr ) ; putc( (*s)->segment, stderr ) ; fputs( ": no diagnostic positions\n", stderr ) ; } } else { if( verbose >= 3 ) { fprintf( stderr, "%s/%c: %d diagnostic positions", (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ; if( verbose >= 4 ) { putc( ':', stderr ) ; putc( ' ', stderr ) ; print_dp_list( stderr, p.first, p.second, 0 ) ; } fprintf( stderr, "; range: %d..%d\n", (*s)->start, (*s)->end ) ; } // Hmm, all this iterator business is somewhat lacking... char *paln1 = aln_con, *paln2 = aln_ass ; int ass_pos = 0 ; while( ass_pos != (*s)->start && *paln1 && *paln2 ) { if( *paln2 != '-' ) ass_pos++ ; ++paln1 ; ++paln2 ; } char *in_ass = maln->ref->seq + (*s)->start ; char *in_frag_v_ass = (*s)->seq ; std::string::const_iterator in_frag_v_ref = cpwaln->frag_seq.begin() ; std::string lifted = lift_over( aln_con, aln_ass, (*s)->start, (*s)->end + 1 ) ; std::string in_ref = lifted.substr( 0, cpwaln->start ) ; in_ref.append( cpwaln->ref_seq ) ; while( ass_pos != (*s)->end +1 && *paln1 && *paln2 && !in_ref.empty() && *in_ass && *in_frag_v_ass && *in_frag_v_ref ) { if( *paln1 != '-' ) { do { in_ref=in_ref.substr(1) ; in_frag_v_ref++ ; } while( in_ref[0] == '-' ) ; } if( *paln2 != '-' ) { ass_pos++ ; do { in_ass++ ; in_frag_v_ass++ ; } while( *in_ass == '-' ) ; } ++paln1 ; ++paln2 ; } if( verbose >= 4 ) putc( '\n', stderr ) ; } } } } -}