{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards #-} -- Simple Mitochondrial Contamination Check on BAM files. -- -- This is based on Ye Olde Contamination Check for the Neanderthal -- genome; the method is the same (and will continue to not work on -- modern humans), but simplified and sanitized. Differences from -- before: -- -- * We use the alignment from the BAM file as is. Earlier we would -- have created *two* new alignments. That is silly, however. Two -- new alignments should not be followed by bean counting, but by an -- attempt to genotype both the sample and the contaminant. -- -- * Before, the sample and contaminant sequences were fixed. Now we -- instead input a list of the diagnostic positions. Instead of an -- explicit list, the two sequences can still be used, or only the -- contaminant can be supplied while the sample is genotype-called. -- TODO -- -- (1) Given a list of diagnostic positions, implement the contamination -- check. Structure of the code can be stolen from ccheck in the -- mia package. -- -- - What do we do about wrapped alignments? Mia has f/b/a labels, -- BAM doesn't. We can see if it overhangs, though. -- -- (2) Given a high-coverage sample, genotype call it and derive the -- diagnostic positions. -- -- - This method needs some definition of the contaminant consensus -- thingy. -- -- (3) Given a `correct' sample sequence, align it to the reference and -- derive diagnostic positions from that. -- -- - Needs the same description of the contaminant thingy. -- -- (4) Consider Read Groups. -- -- - One result per read group (or maybe per library, alternatively -- per file) should be produced. -- - The "aDNA" setting should be determined from either the @RG -- header or from an external source. import Bio.Base import Bio.Bam hiding ( Unknown ) import Control.Applicative import Control.Monad import Data.Bits import Data.List import Numeric import System.Console.GetOpt import System.Environment import System.Exit import System.IO import qualified Data.HashMap.Strict as HM import qualified Data.IntMap as IM data Conf = Conf { conf_adna :: Adna, conf_verbosity :: Int, conf_header :: HeaderFn, conf_output :: OutputFn, conf_shoot_foot :: Bool, conf_dp_list :: DpList } options :: [OptDescr (Conf -> IO Conf)] options = public_options ++ hidden_options where public_options = [ Option "a" ["ancient","dsprot"] (NoArg (set_adna ancientDNAds)) "Treat DNA as ancient, double strand protocol", Option "s" ["ssprot"] (NoArg (set_adna ancientDNAss)) "Treat DNA as ancient, single strand protocol", Option "" ["fresh"] (NoArg (set_adna freshDNA)) "Treat DNA as fresh (not ancient)", Option "T" ["table"] (NoArg set_output_table) "Print output in table form", Option "v" ["verbose"] (NoArg (mod_verbosity succ)) "Produce more debug output", Option "q" ["quiet"] (NoArg (mod_verbosity pred)) "Produce less debug output", Option "h?" ["help","usage"] (NoArg usage) "Print this message and exit" ] hidden_options = [ Option "" ["shoot","foot"] (NoArg set_shoot_foot) [] ] usage _ = do pn <- getProgName hPutStrLn stderr $ usageInfo ("Usage: " ++ pn ++ " [OPTION...] [Bam-File...]") public_options exitSuccess set_shoot_foot c = return $ c { conf_shoot_foot = True } set_adna a c = return $ c { conf_adna = a } set_output_table c = return $ c { conf_output = show_result_table, conf_header = header_table } mod_verbosity f c = return $ c { conf_verbosity = f (conf_verbosity c) } conf0 :: IO Conf conf0 = return $ Conf { conf_adna = freshDNA , conf_verbosity = 1 , conf_header = "" , conf_output = show_result_plain , conf_shoot_foot = False , conf_dp_list = error "no diagnostic positions defined" } {- Old options... may or may not be of much use. struct option longopts[] = { { "reference", required_argument, 0, 'r' }, { "transversions", no_argument, 0, 't' }, { "span", required_argument, 0, 's' }, { "maxd", required_argument, 0, 'd' }, } ; void usage( const char* pname ) { "Reads a maln file and tries to quantify contained contamination.\n" "Options:\n" " -r, --reference FILE FASTA file with the likely contaminant (default: builtin mt311)\n" " -t, --transversions Treat only transversions as diagnostic\n" " -s, --span M-N Look only at range from M to N\n" " -n, --numpos N Require N diagnostic sites in a single read (default: 1)\n" } -} -- | A list of diagnostic positions. We drop the failed idea of -- "weakly diagnostic positions". We also work in the coordinate system -- of the reference. Therefore, a diagnostic position is defined by -- position, allele in the clean sample and allele in the contaminant. data Dp = Dp { _dp_clean_allele :: !Nucleotide , _dp_dirty_allele :: !Nucleotide } deriving Show type DpList = IM.IntMap Dp show_dp_list :: DpList -> ShowS show_dp_list = flip IM.foldrWithKey id $ \pos (Dp cln drt) k -> (:) '<' . shows pos . (:) ':' . shows drt . (:) ',' . shows cln . (++) ">, " . k -- | Reads are classified into one of these. data Klass = Unknown | Clean | Dirty | Conflict | Nonsense deriving (Ord, Eq, Enum, Bounded, Show) instance Monoid Klass where mempty = Unknown Clean `mappend` Dirty = Conflict Dirty `mappend` Clean = Conflict x `mappend` y = if x < y then y else x newtype Summary = Summary (IM.IntMap Int) sum_count :: Klass -> Summary -> Summary sum_count kl (Summary m) = Summary $ IM.insertWith' (+) (fromEnum kl) 1 m sum_get :: Klass -> Summary -> Int sum_get kl (Summary m) = IM.findWithDefault 0 (fromEnum kl) m -- | Determines what an allele could come from. Does not take -- port-mortem modifications into account. classify :: Dp -> Nucleotide -> Klass classify (Dp cln drt) nuc | maybe_clean && maybe_dirty = Unknown | maybe_clean = Clean | maybe_dirty = Dirty | otherwise = Nonsense where maybe_clean = unN cln .&. unN nuc /= 0 maybe_dirty = unN drt .&. unN nuc /= 0 -- | We deal with aDNA by transforming a base into all the bases it -- could have been. So the configuration is simply the transformation -- function. type Adna = Nucleotide -> Nucleotide -- | Fresh DNA: no transformation. freshDNA :: Adna freshDNA = id -- | Ancient DNA, single strand protocol. Deamination can turn C into T -- only. ancientDNAss :: Adna ancientDNAss = N . app . unN where app x = if x .&. unN nucT /= 0 then x .|. unN nucC else x -- | Ancient DNA, double strand protocol. Deamination can turn C into T -- and G into A. ancientDNAds :: Adna ancientDNAds = N . app1 . app2 . unN where app1 x = if x .&. unN nucT /= 0 then x .|. unN nucC else x app2 x = if x .&. unN nucA /= 0 then x .|. unN nucG else x -- | Classifying a read. In an ideal world, we'd be looking at a single -- read mapped in one piece. Instead, we may be looking at half a mate -- pair or even a single read mapped inconveniently across the origin. -- -- We will be reading a BAM stream. All reads with the same name (there -- maybe 1..4, assuming no major breakage) need to be processed -- together. We'll isolate that here: our input stream consists of -- reads that all have the same qname. Results in exactly one 'Klass'. -- We will ignore mate pairs that are improperly mapped or filtered. -- -- May need more options. Note that application of the aDNA function -- depends on the strandedness of the alignment. FIXME -- -- This is the only place where counting of votes was used before, and -- only for debugging purposes. Everything that was either dirty or -- clean (but not both) counted as a vote. classify_read_set :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Klass classify_read_set = undefined -- | Classifying a stream. We create a map from read name to iteratee. -- New names are inserted, known names fed to stored iteratees. -- ``Done'' iteratees are disposed of immediately. classify_stream :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Summary classify_stream dps adna = foldStreamM classify_read (Summary IM.empty, HM.empty) >>= lift . finish where classify0 = classify_read_set dps adna classify_read (summary, iters) rd = do let nm = b_qname $ unpackBam rd let it = HM.lookupDefault classify0 nm iters (isdone, it') <- enumPure1Chunk [rd] it >>= enumCheckIfDone if isdone then do cl <- run it' return (sum_count cl summary, HM.delete nm iters) else return (summary, HM.insert nm it' iters) finish (summary, iters) = foldM (\s it -> flip sum_count s `liftM` run it) summary $ HM.elems iters {- Missing from the output right now: * filename (library would be better) * alignment distance (only useful if DPs are derived from alignment) * number of difference (likewise) * number of DPs * number of DPs which are transversions -} result_labels :: [ String ] result_labels = [ "unclassified", "clean", "polluting", "conflicting", "nonsensical", "LB", "ML", "UB" ] type HeaderFn = String type OutputFn = Summary -> Maybe [Double] -> String show_result_plain :: OutputFn show_result_plain summary ests = unlines $ zipWith fmt result_labels [minBound..maxBound] ++ [[]] where labellen = (+) 2 . maximum . map length $ zipWith const result_labels [minBound..maxBound::Klass] pad n s = replicate (n - length s) ' ' ++ s fmt lbl kl = pad labellen lbl ++ " fragments: " ++ show (sum_get kl summary) ++ if kl == Dirty then maybe [] fmt_ests ests else [] fmt_ests [lb,ml,ub] = " (" ++ showFFloat (Just 1) lb " .. " ++ showFFloat (Just 1) ml " .. " ++ showFFloat (Just 1) ub "%)" header_table :: HeaderFn header_table = intercalate "\t" result_labels show_result_table :: OutputFn show_result_table summary ests = intercalate "\t" $ [ show $ sum_get kl summary | kl <- [minBound..maxBound] ] ++ maybe (replicate 3 "N/A") (map (\x -> showFFloat (Just 1) x [])) ests show_result_with :: (Summary -> Maybe [Double] -> a) -> Summary -> a show_result_with f summary = f summary (if nn /= 0 then Just [lb,ml,ub] else Nothing) where z = 1.96 -- this is Z_{0.975}, giving a 95% confidence interval k = fromIntegral (sum_get Dirty summary) n = k + fromIntegral (sum_get Clean summary) nn = sum_get Dirty summary + sum_get Clean summary p_ = k / n c = p_ + 0.5 * z * z / n w = z * sqrt( p_ * (1-p_) / n + 0.25 * z * z / (n*n) ) d = 1 + z * z / n lb = max 0 $ 100 * (c-w) / d -- lower bound of CI ml = 100 * p_ -- ML estimate ub = min 100 $ 100 * (c+w) / d -- upper bound of CI -- The following is old 'ccheck'... for reference and guidance. {- /* * Contamination Checker. Outline: * * - read the human reference (concsensus of contaminants); this will * contain ambiguity codes * - read maln file, including assembly and assembled reads * - align contaminant-consensus and assembly globally * This uses Myers' O(nd) aligner, for it grasps ambiguity codes and * runs fast enough, in little memory, for long, but similar * sequences. * - find "strongly diagnostic positions", positions where ass and con * are incompatible, and "weakly diagnostic positions", positions * where ass and con are not always equal * - for every "end" fragment: store it and later join with its other * half to give an effectively "full" fragment * - for every "full" fragment: if it crosses at least one (strongly or * weakly) diagnostic position, cut out that range from ref and align * to it globally using the mia aligner * - pass 1: for every weakly diagnostic position where the bases agree, * store whether a contaminant was discovered, and if so, turn them * into "actually diagnostic positions". * - pass 2: for every (strongly or actually) diagnostic position where * the bases agree, classify it, then classify the fragment * (conflicting, uninformative, contaminant, endogenous) * - produce a summary * * Notable features: * - operates sensibly on aDNA * - has sensible commandline and doesn't make too much noise in operation * - optionally considers only certain diagnostic positions * (tranversions only and/or some region only) * - new consensus sequence has other letters besides N */ // Everything that differs is weakly diagnostic, unless it's a gap. // Note that this mean that Ns are usually weakly diagnostic. bool is_diagnostic( char aln1, char aln2 ) { return aln1 != '-' && aln2 != '-' && toupper(aln1) != toupper(aln2) ; } // Interesting question... given ambiguity codes, what's a transversion? // One way to put it: anything that is incompatible with all four // transitions. Needs a different implementation. bool is_transversion( char a, char b ) { char u = a & ~32 ; char v = b & ~32 ; switch( u ) { case 'A': return v != 'G' ; case 'C': return v != 'T' ; case 'G': return v != 'A' ; case 'T': case 'U': return v != 'C' ; default: return false ; } } dp_list mk_dp_list( const char* aln1, const char* aln2, int span_from, int span_to ) { dp_list l ; int index = 0 ; while( index != span_from && *aln1 && *aln2 ) { if( *aln2 != '-' ) ++index ; ++aln1 ; ++aln2 ; } while( index != span_to && *aln1 && *aln2 ) { if( is_diagnostic( *aln1, *aln2 ) ) { l[index].consensus = *aln1 ; l[index].assembly = *aln2 ; } if( *aln2 != '-' ) ++index ; ++aln1 ; ++aln2 ; } return l ; } -} -- We won't keep this. Mt311 should be stored as half a Dp list. -- extern char mt311_sequence[] ; -- extern const int mt311_sequence_size ; main :: IO () main = do (opts, files, errors) <- getOpt Permute options <$> getArgs unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure Conf{..} <- foldl (>>=) conf0 opts {- bool transversions = false ; int min_diag_posns = 1 ; int maxd = 0 ; int span_from = 0, span_to = INT_MAX ; int opt ; do { opt = getopt_long( argc, argv, "r:avhts:d:n:MfTF", longopts, 0 ) ; switch( opt ) { case 'r': read_fasta_ref( &hum_ref, optarg ) ; break ; case 't': transversions = true ; break ; case 's': sscanf( optarg, "%u-%u", &span_from, &span_to ) ; if( span_from ) span_from-- ; break ; case 'n': min_diag_posns = atoi( optarg ) ; break ; case 'd': maxd = atoi( optarg ) ; break ; } } while( opt != -1 ) ; -} when (IM.size conf_dp_list < 40 && not conf_shoot_foot) $ do hPutStrLn stderr $ "\n *** Low number (" ++ shows (IM.size conf_dp_list) ") of diagnostic positions found.\n\ \ *** I will stop now for your own safety.\n\ \ *** If you are sure you want to shoot yourself\n\ \ *** in the foot, read the man page to learn\n\ \ *** how to lift this restriction.\n\n" exitFailure -- TODO We will usually want to seek to the mitochondrion, which -- doesn't work with the simple 'mergeInputs' invocation. r <- mergeInputs combineCoordinates files >=> run $ \hdr -> classify_stream conf_dp_list conf_adna putStrLn $ unlines $ conf_header : show_result_with conf_output r : [] {- if( mktable ) { fputs( infile.c_str(), stdout ) ; putchar( '\t' ) ; } else { puts( infile.c_str() ) ; putchar( '\n' ) ; } -} -- if( !maxd ) maxd = max( strlen(hum_ref.seq), strlen(maln->ref->seq) ) / 10 ; -- char *aln_con = (char*)malloc( strlen(hum_ref.seq) + maxd + 2 ) ; -- char *aln_ass = (char*)malloc( strlen(maln->ref->seq) + maxd + 2 ) ; -- unsigned d = myers_diff( hum_ref.seq, myers_align_globally, maln->ref->seq, maxd, aln_con, aln_ass ) ; {- if( d == UINT_MAX ) { fprintf( stderr, "\n *** Could not align references with up to %d mismatches.\n" " *** This is usually a sign of trouble, but\n" " *** IF AND ONLY IF YOU KNOW WHAT YOU ARE DOING, you can\n" " *** try the -d N option with N > %d.\n\n", maxd, maxd ) ; return 1 ; } if( mktable ) printf( "%d\t", d ) ; else printf( " %d alignment distance between reference and assembly.\n", d ) ; if( verbose >= 6 ) print_aln( aln_con, aln_ass ) ; dp_list l = mk_dp_list( aln_con, aln_ass, span_from, span_to ) ; if( mktable ) printf( "%u\t", (unsigned)l.size() ) ; else printf( " %u total differences between reference and assembly.\n", (unsigned)l.size() ) ; int num_strong = 0 ; for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i ) if( i->second.strength > weak ) ++num_strong ; if( mktable ) printf( "%d\t", (int)l.size() ) ; else { printf( " %d diagnostic positions", (int)l.size() ) ; if( span_from != 0 || span_to != INT_MAX ) printf( " in range [%d,%d)", span_from, span_to ) ; printf( ", %d of which are strongly diagnostic.\n", num_strong ) ; } if( verbose >= 3 ) { print_dp_list( stderr, l.begin(), l.end(), '\n', 0 ) ; print_dp_list( stderr, l.begin(), l.end(), '\n', 1 ) ; } -} {- if( verbose >= 2 ) fputs( "Pass one: finding actually diagnostic positions.\n", stderr ) ; for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s ) { fixup_name( *s ) ; std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ; // are we overlapping anything at all? std::pair< dp_list::const_iterator, dp_list::const_iterator > p = overlapped_diagnostic_positions( l, *s ) ; if( verbose >= 3 ) { fprintf( stderr, "%s/%c:\n %d potentially diagnostic positions", (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ; if( verbose >= 4 ) { putc( ':', stderr ) ; putc( ' ', stderr ) ; print_dp_list( stderr, p.first, p.second, 0 ) ; } fprintf( stderr, "; range: %d..%d\n", (*s)->start, (*s)->end ) ; } -} {- int t = 0 ; for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i ) if( is_transversion( i->second.consensus, i->second.assembly ) ) ++t ; if( mktable ) printf( "%d\t%d\t", t, num_strong ) ; else { printf( " %d effectively diagnostic positions", (int)l.size() ) ; if( span_from != 0 || span_to != INT_MAX ) printf( " in range [%d,%d)", span_from, span_to ) ; printf( ", %d of which are transversions.\n\n", t ) ; } if( verbose >= 3 ) print_dp_list( stderr, l.begin(), l.end(), '\n' ) ; std::deque< cached_pwaln >::const_iterator cpwaln = cached_pwalns.begin() ; for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s, ++cpwaln ) { whatsit klass = unknown ; int votes = 0, votes2 = 0 ; std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ; // enough overlap? (we only have _actually_ diagnostic positions now) std::pair< dp_list::const_iterator, dp_list::const_iterator > p = overlapped_diagnostic_positions( l, *s ) ; if( std::distance( p.first, p.second ) < min_diag_posns ) { if( verbose >= 3 ) { fputs( (*s)->id, stderr ) ; putc( '/', stderr ) ; putc( (*s)->segment, stderr ) ; fputs( ": no diagnostic positions\n", stderr ) ; } } else { if( verbose >= 3 ) { fprintf( stderr, "%s/%c: %d diagnostic positions", (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ; if( verbose >= 4 ) { putc( ':', stderr ) ; putc( ' ', stderr ) ; print_dp_list( stderr, p.first, p.second, 0 ) ; } fprintf( stderr, "; range: %d..%d\n", (*s)->start, (*s)->end ) ; } // Hmm, all this iterator business is somewhat lacking... char *paln1 = aln_con, *paln2 = aln_ass ; int ass_pos = 0 ; while( ass_pos != (*s)->start && *paln1 && *paln2 ) { if( *paln2 != '-' ) ass_pos++ ; ++paln1 ; ++paln2 ; } char *in_ass = maln->ref->seq + (*s)->start ; char *in_frag_v_ass = (*s)->seq ; std::string::const_iterator in_frag_v_ref = cpwaln->frag_seq.begin() ; std::string lifted = lift_over( aln_con, aln_ass, (*s)->start, (*s)->end + 1 ) ; std::string in_ref = lifted.substr( 0, cpwaln->start ) ; in_ref.append( cpwaln->ref_seq ) ; while( ass_pos != (*s)->end +1 && *paln1 && *paln2 && !in_ref.empty() && *in_ass && *in_frag_v_ass && *in_frag_v_ref ) { if( *paln1 != '-' ) { do { in_ref=in_ref.substr(1) ; in_frag_v_ref++ ; } while( in_ref[0] == '-' ) ; } if( *paln2 != '-' ) { ass_pos++ ; do { in_ass++ ; in_frag_v_ass++ ; } while( *in_ass == '-' ) ; } ++paln1 ; ++paln2 ; } if( verbose >= 4 ) putc( '\n', stderr ) ; } } } } -}