{-# LANGUAGE OverloadedStrings, BangPatterns, RecordWildCards #-}
-- Simple Mitochondrial Contamination Check on BAM files.
--
-- This is based on Ye Olde Contamination Check for the Neanderthal
-- genome; the method is the same (and will continue to not work on
-- modern humans), but simplified and sanitized.  Differences from
-- before:
--
-- * We use the alignment from the BAM file as is.  Earlier we would
--   have created *two* new alignments.  That is silly, however.  Two
--   new alignments should not be followed by bean counting, but by an
--   attempt to genotype both the sample and the contaminant.
--
-- * Before, the sample and contaminant sequences were fixed.  Now we
--   instead input a list of the diagnostic positions.  Instead of an
--   explicit list, the two sequences can still be used, or only the
--   contaminant can be supplied while the sample is genotype-called.


-- TODO
--
-- (1) Given a list of diagnostic positions, implement the contamination
--     check.  Structure of the code can be stolen from ccheck in the
--     mia package.
--
--     - What do we do about wrapped alignments?  Mia has f/b/a labels,
--       BAM doesn't.  We can see if it overhangs, though.
--
-- (2) Given a high-coverage sample, genotype call it and derive the
--     diagnostic positions.
--
--     - This method needs some definition of the contaminant consensus
--       thingy.
--
-- (3) Given a `correct' sample sequence, align it to the reference and
--     derive diagnostic positions from that.
--
--     - Needs the same description of the contaminant thingy.
--
-- (4) Consider Read Groups.
--
--     - One result per read group (or maybe per library, alternatively
--       per file) should be produced.
--     - The "aDNA" setting should be determined from either the @RG
--       header or from an external source.


import Bio.Base
import Bio.Bam hiding ( Unknown )
import Control.Applicative
import Control.Monad
import Data.Bits
import Data.List
import Numeric
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.IO

import qualified Data.HashMap.Strict        as HM
import qualified Data.IntMap                as IM

-- | Run-time configuration.  It starts out as 'conf0' and is updated by
-- the actions attached to the command line 'options'.
data Conf = Conf
    { conf_adna       :: Adna       -- ^ aDNA transformation handed to the classifier
    , conf_verbosity  :: Int        -- ^ raised by @-v@, lowered by @-q@
    , conf_header     :: HeaderFn   -- ^ header printed in front of the result
    , conf_output     :: OutputFn   -- ^ formatter for the final summary
    , conf_shoot_foot :: Bool       -- ^ skip the check for too few diagnostic positions
    , conf_dp_list    :: DpList     -- ^ the diagnostic positions; must stay lazy,
                                    --   'conf0' fills it with an 'error' placeholder
    }

-- | All command line options.  Every option carries an action that
-- updates the configuration.  The hidden options are accepted, but are
-- not listed in the usage message.
options :: [OptDescr (Conf -> IO Conf)]
options = visible ++ hidden
  where
    visible =
        [ flag "a"  ["ancient","dsprot"] (with_adna ancientDNAds) "Treat DNA as ancient, double strand protocol"
        , flag "s"  ["ssprot"]           (with_adna ancientDNAss) "Treat DNA as ancient, single strand protocol"
        , flag ""   ["fresh"]            (with_adna freshDNA)     "Treat DNA as fresh (not ancient)"
        , flag "T"  ["table"]            as_table                 "Print output in table form"

        , flag "v"  ["verbose"]          (verbosity succ)         "Produce more debug output"
        , flag "q"  ["quiet"]            (verbosity pred)         "Produce less debug output"
        , flag "h?" ["help","usage"]     print_usage              "Print this message and exit"
        ]

    hidden =
        [ flag "" ["shoot","foot"] allow_foot_shooting [] ]

    -- None of the options takes an argument.
    flag short long action = Option short long (NoArg action)

    -- Prints the usage message for the visible options to stderr and
    -- terminates the program successfully.
    print_usage _ = do
        prog <- getProgName
        hPutStrLn stderr (usageInfo ("Usage: " ++ prog ++ " [OPTION...] [Bam-File...]") visible)
        exitSuccess

    with_adna a         conf = return conf { conf_adna = a }
    as_table            conf = return conf { conf_output = show_result_table, conf_header = header_table }
    verbosity f         conf = return conf { conf_verbosity = f (conf_verbosity conf) }
    allow_foot_shooting conf = return conf { conf_shoot_foot = True }


-- | The configuration before any option has been applied: fresh DNA,
-- plain output without header, safety check enabled.  There is no
-- default list of diagnostic positions; touching it raises an error.
conf0 :: IO Conf
conf0 = return defaults
  where
    defaults = Conf
        { conf_adna       = freshDNA
        , conf_verbosity  = 1
        , conf_header     = ""
        , conf_output     = show_result_plain
        , conf_shoot_foot = False
        , conf_dp_list    = error "no diagnostic positions defined"
        }

{- Old options... may or may not be of much use.

struct option longopts[] = {
	{ "reference", required_argument, 0, 'r' },
	{ "transversions", no_argument, 0, 't' },
	{ "span", required_argument, 0, 's' },
	{ "maxd", required_argument, 0, 'd' },
} ;

void usage( const char* pname )
{
		"Reads a maln file and tries to quantify contained contamination.\n"
		"Options:\n"
		"  -r, --reference FILE     FASTA file with the likely contaminant (default: builtin mt311)\n"
		"  -t, --transversions      Treat only transversions as diagnostic\n"
		"  -s, --span M-N           Look only at range from M to N\n"
		"  -n, --numpos N           Require N diagnostic sites in a single read (default: 1)\n"
}
-}

-- | A diagnostic position.  We drop the failed idea of "weakly
-- diagnostic positions", and we work in the coordinate system of the
-- reference.  A diagnostic position is therefore defined by its
-- position, the allele in the clean sample and the allele in the
-- contaminant.  The position itself is not stored here; it is the key
-- under which the 'Dp' is filed in a 'DpList'.
data Dp = Dp
    { _dp_clean_allele :: !Nucleotide   -- ^ allele of the clean sample
    , _dp_dirty_allele :: !Nucleotide   -- ^ allele of the contaminant
    }
  deriving Show

-- | The diagnostic positions, keyed by their position.
type DpList = IM.IntMap Dp

-- | Renders the diagnostic positions in ascending order of position,
-- each one as @\<position:dirty,clean\>@ followed by a comma and a
-- space.
show_dp_list :: DpList -> ShowS
show_dp_list dps = IM.foldrWithKey entry id dps
  where
    entry pos (Dp cln drt) rest =
        showChar '<' . shows pos .
        showChar ':' . shows drt .
        showChar ',' . shows cln .
        showString ">, " . rest


-- | Reads are classified into one of these.  The order of the
-- constructors is significant: the derived 'Ord' instance decides which
-- verdict wins when two are combined, and the derived 'Enum' instance
-- provides the keys under which reads are counted.
data Klass
    = Unknown
    | Clean
    | Dirty
    | Conflict
    | Nonsense
  deriving (Ord, Eq, Enum, Bounded, Show)

-- | Combines two verdicts.  'Unknown' is the neutral element, 'Clean'
-- together with 'Dirty' yields 'Conflict', and in every other case the
-- greater verdict (in constructor order) wins.
instance Monoid Klass where
    mempty = Unknown
    mappend a b =
        case (a, b) of
            (Clean, Dirty) -> Conflict
            (Dirty, Clean) -> Conflict
            _              -> max a b

-- | Number of reads seen per class.  The key is the 'fromEnum' of the
-- 'Klass' (see 'sum_count' and 'sum_get'); a missing key means zero.
newtype Summary = Summary (IM.IntMap Int)

-- | Counts one more read of the given class.
--
-- The new count is forced before it is stored, so that no chain of
-- thunks builds up while a long stream is summarized.  This is written
-- with plain 'IM.insert' instead of @IM.insertWith'@, because the
-- latter is deprecated in containers 0.5 and no longer exists in
-- containers 0.6 and later.
sum_count :: Klass -> Summary -> Summary
sum_count kl (Summary m) = total `seq` Summary (IM.insert key total m)
  where
    key   = fromEnum kl
    total = IM.findWithDefault 0 key m + 1

-- | Number of reads counted for the given class; zero if there were
-- none.
sum_get :: Klass -> Summary -> Int
sum_get kl (Summary counts) =
    case IM.lookup (fromEnum kl) counts of
        Just n  -> n
        Nothing -> 0


-- | Determines where an allele could have come from.  An observed base
-- is compatible with an allele if the two have at least one bit in
-- common.  Does not take post-mortem modifications into account.
classify :: Dp -> Nucleotide -> Klass
classify (Dp cln drt) nuc =
    case (compatible cln, compatible drt) of
        (True,  True ) -> Unknown       -- could be either, tells us nothing
        (True,  False) -> Clean
        (False, True ) -> Dirty
        (False, False) -> Nonsense      -- fits neither sample nor contaminant
  where
    compatible allele = unN allele .&. unN nuc /= 0


-- | We deal with aDNA by transforming a base into all the bases it
-- could have been.  So the configuration is simply the transformation
-- function.  See 'freshDNA', 'ancientDNAss' and 'ancientDNAds' for the
-- available transformations.
type Adna = Nucleotide -> Nucleotide

-- | Fresh DNA: every base is taken at face value.
freshDNA :: Adna
freshDNA nuc = nuc

-- | Ancient DNA, single strand protocol.  Deamination can turn C into T
-- only, so anything that may be a T may also have been a C.
ancientDNAss :: Adna
ancientDNAss nuc = N (undo_c_to_t (unN nuc))
  where
    undo_c_to_t bits
        | bits .&. unN nucT /= 0 = bits .|. unN nucC
        | otherwise              = bits

-- | Ancient DNA, double strand protocol.  Deamination can turn C into T
-- and G into A, so anything that may be a T may also have been a C, and
-- anything that may be an A may also have been a G.
ancientDNAds :: Adna
ancientDNAds = N . undo nucT nucC . undo nucA nucG . unN
  where
    -- If the base may be @seen@, it may also have been @before@.
    undo seen before bits
        | bits .&. unN seen /= 0 = bits .|. unN before
        | otherwise              = bits


-- | Classifying a read.  In an ideal world, we'd be looking at a single
-- read mapped in one piece.  Instead, we may be looking at half a mate
-- pair or even a single read mapped inconveniently across the origin.
--
-- We will be reading a BAM stream.  All reads with the same name (there
-- may be 1..4, assuming no major breakage) need to be processed
-- together.  We'll isolate that here:  our input stream consists of
-- reads that all have the same qname.  Results in exactly one 'Klass'.
-- We will ignore mate pairs that are improperly mapped or filtered.
--
-- May need more options.  Note that application of the aDNA function
-- depends on the strandedness of the alignment.  FIXME
--
-- This is the only place where counting of votes was used before, and
-- only for debugging purposes.  Everything that was either dirty or
-- clean (but not both) counted as a vote.
--
-- NOTE: this is still a stub.  Using it aborts the program; the error
-- message names the missing piece so the failure is easy to trace.

classify_read_set :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Klass
classify_read_set = error "classify_read_set: not implemented yet"

-- | Classifying a stream.  We keep a map from read name to the iteratee
-- that classifies the reads of that name.  A read with a new name gets
-- a fresh iteratee, a read with a known name is fed to the stored one.
-- An iteratee that is done after being fed is run at once, its verdict
-- is counted and it is removed from the map.  At the end of the stream,
-- the iteratees that are still waiting are run and counted, too.

classify_stream :: Monad m => DpList -> Adna -> Iteratee [BamRaw] m Summary
classify_stream dps adna = do
    final <- foldStreamM step (Summary IM.empty, HM.empty)
    lift (drain final)
  where
    fresh = classify_read_set dps adna

    -- Feeds a single read to the iteratee responsible for its name.
    step (acc, pending) raw = do
        let qname = b_qname (unpackBam raw)
            known = HM.lookupDefault fresh qname pending
        (finished, next) <- enumCheckIfDone =<< enumPure1Chunk [raw] known
        if finished
            then do verdict <- run next
                    return (sum_count verdict acc, HM.delete qname pending)
            else return (acc, HM.insert qname next pending)

    -- Collects the verdicts of everything still pending.
    drain (acc, pending) = foldM tally acc (HM.elems pending)

    tally acc it = do
        verdict <- run it
        return (sum_count verdict acc)


{- Missing from the output right now:

 * filename (library would be better)
 * alignment distance (only useful if DPs are derived from alignment)
 * number of differences (likewise)
 * number of DPs
 * number of DPs which are transversions
-}

-- | Column labels of the result: one for each 'Klass', in constructor
-- order, followed by the labels of the three estimates (lower bound,
-- maximum likelihood estimate, upper bound).
result_labels :: [ String ]
result_labels = klass_labels ++ estimate_labels
  where
    klass_labels    = [ "unclassified", "clean", "polluting", "conflicting", "nonsensical" ]
    estimate_labels = [ "LB", "ML", "UB" ]

-- | Header that is printed in front of the result; empty for the plain
-- output format.
type HeaderFn = String
-- | Formats a summary together with the estimates, if there are any.
-- 'show_result_with' supplies the estimates as lower bound, maximum
-- likelihood estimate and upper bound, in this order.
type OutputFn = Summary -> Maybe [Double] -> String

-- | Human readable output: one line per 'Klass' with the right aligned
-- label and the number of fragments, followed by an empty line.  The
-- estimates, if present, are appended to the line for 'Dirty'
-- fragments, formatted with one decimal place and separated by @" .. "@.
--
-- Any number of estimates is accepted.  For the usual three (lower
-- bound, ML estimate, upper bound) the output is
-- @ (lb .. ml .. ub%)@; an empty list prints nothing.
show_result_plain :: OutputFn
show_result_plain summary ests = unlines $ zipWith fmt result_labels [minBound..maxBound] ++ [[]]
  where
    -- Only the labels that actually get printed determine the width.
    labellen = (+) 2 . maximum . map length $ zipWith const result_labels [minBound..maxBound::Klass]
    pad n s  = replicate (n - length s) ' ' ++ s

    fmt lbl kl = pad labellen lbl ++ " fragments: " ++ show (sum_get kl summary) ++
                 if kl == Dirty then maybe [] fmt_ests ests else []

    -- Total on purpose: a list of unexpected length must not crash the
    -- program with a pattern match failure.
    fmt_ests [] = []
    fmt_ests xs = " (" ++ intercalate " .. " (map fmt_est xs) ++ "%)"

    fmt_est x = showFFloat (Just 1) x []

-- | Header for the table format: all result labels, separated by tabs.
header_table :: HeaderFn
header_table = concat (intersperse "\t" result_labels)

-- | Table output: the count for each 'Klass' in constructor order,
-- followed by the estimates with one decimal place, all separated by
-- tabs.  If there are no estimates, three times @N/A@ is printed in
-- their place.
show_result_table :: OutputFn
show_result_table summary ests = intercalate "\t" (counts ++ estimates)
  where
    counts    = map (\kl -> show (sum_get kl summary)) [minBound .. maxBound]
    estimates = case ests of
                    Nothing -> replicate 3 "N/A"
                    Just xs -> [ showFFloat (Just 1) x [] | x <- xs ]


-- | Estimates the contamination from the number of clean and dirty
-- fragments and passes the summary and the estimate on to the given
-- formatter.  The estimate is the list of lower bound, maximum
-- likelihood estimate and upper bound, each in percent; the bounds are
-- those of the 95% Wilson score interval, clamped to 0..100.  If there
-- are neither clean nor dirty fragments, there is no estimate.
show_result_with :: (Summary -> Maybe [Double] -> a) -> Summary -> a
show_result_with f summary
    | informative == 0 = f summary Nothing
    | otherwise        = f summary (Just [lower, point, upper])
  where
    dirty       = sum_get Dirty summary
    clean       = sum_get Clean summary
    informative = dirty + clean

    z = 1.96   -- this is Z_{0.975}, giving a 95% confidence interval
    k =     fromIntegral dirty
    n = k + fromIntegral clean

    p_     = k / n
    centre = p_ + 0.5 * z * z / n
    spread = z * sqrt( p_ * (1-p_) / n + 0.25 * z * z / (n*n) )
    denom  = 1 + z * z / n

    lower = max  0  $ 100 * (centre-spread) / denom    -- lower bound of CI
    point =           100 * p_                         -- ML estimate
    upper = min 100 $ 100 * (centre+spread) / denom    -- upper bound of CI


-- The following is old 'ccheck'... for reference and guidance.


{-
/*
 * Contamination Checker.  Outline:
 *
 * - read the human reference (consensus of contaminants); this will
 *   contain ambiguity codes
 * - read maln file, including assembly and assembled reads
 * - align contaminant-consensus and assembly globally
 *   This uses Myers' O(nd) aligner, for it grasps ambiguity codes and
 *   runs fast enough, in little memory, for long, but similar
 *   sequences.
 * - find "strongly diagnostic positions", positions where ass and con
 *   are incompatible, and "weakly diagnostic positions", positions
 *   where ass and con are not always equal
 * - for every "end" fragment: store it  and later join with its other
 *   half to give an effectively "full" fragment
 * - for every "full" fragment: if it crosses at least one (strongly or
 *   weakly) diagnostic position, cut out that range from ref and align
 *   to it globally using the mia aligner
 * - pass 1: for every weakly diagnostic position where the bases agree,
 *   store whether a contaminant was discovered, and if so, turn them
 *   into "actually diagnostic positions".
 * - pass 2: for every (strongly or actually) diagnostic position where
 *   the bases agree, classify it, then classify the fragment
 *   (conflicting, uninformative, contaminant, endogenous)
 * - produce a summary
 *
 * Notable features:
 * - operates sensibly on aDNA
 * - has sensible commandline and doesn't make too much noise in operation
 * - optionally considers only certain diagnostic positions
 *   (transversions only and/or some region only)
 * - new consensus sequence has other letters besides N
 */

// Everything that differs is weakly diagnostic, unless it's a gap.
// Note that this means that Ns are usually weakly diagnostic.
bool is_diagnostic( char aln1, char aln2 )
{
	return aln1 != '-' && aln2 != '-' && toupper(aln1) != toupper(aln2) ;
}

// Interesting question... given ambiguity codes, what's a transversion?
// One way to put it:  anything that is incompatible with all four
// transitions.  Needs a different implementation.
bool is_transversion( char a, char b )
{
	char u = a & ~32 ;
	char v = b & ~32 ;
	switch( u )
	{
		case 'A': return v != 'G' ;
		case 'C': return v != 'T' ;
		case 'G': return v != 'A' ;
		case 'T':
		case 'U': return v != 'C' ;
		default: return false ;
	}
}


dp_list mk_dp_list( const char* aln1, const char* aln2, int span_from, int span_to )
{
	dp_list l ;
    int index = 0 ;
    while( index != span_from && *aln1 && *aln2 )
    {
		if( *aln2 != '-' ) ++index ;
		++aln1 ;
		++aln2 ;
    }
	while( index != span_to && *aln1 && *aln2 )
	{
		if( is_diagnostic( *aln1, *aln2 ) ) {
            l[index].consensus = *aln1 ;
            l[index].assembly = *aln2 ;
        }
		if( *aln2 != '-' ) ++index ;
		++aln1 ;
		++aln2 ;
	}
	return l ;
}
-}

-- We won't keep this.  Mt311 should be stored as half a Dp list.
-- extern       char mt311_sequence[] ;
-- extern const int  mt311_sequence_size ;

-- | Entry point.  Parses the command line, refuses to work with fewer
-- than 40 diagnostic positions (unless told otherwise), classifies the
-- reads of the merged input files and prints the header and the
-- formatted summary.
main :: IO ()
main = do
    (opts, files, errors) <- fmap (getOpt Permute options) getArgs
    unless (null errors) $ do
        mapM_ (hPutStrLn stderr) errors
        exitFailure

    -- The option actions are applied from left to right, starting from
    -- the default configuration.
    Conf { conf_adna       = adna
         , conf_header     = header
         , conf_output     = output
         , conf_shoot_foot = reckless
         , conf_dp_list    = dps } <- foldl (>>=) conf0 opts

{-
	bool transversions = false ;
	int min_diag_posns = 1 ;
	int maxd = 0 ;
	int span_from = 0, span_to = INT_MAX ;

	int opt ;
	do {
		opt = getopt_long( argc, argv, "r:avhts:d:n:MfTF", longopts, 0 ) ;
		switch( opt )
		{
			case 'r': read_fasta_ref( &hum_ref, optarg ) ; break ;
			case 't': transversions = true ; break ;
			case 's': sscanf( optarg, "%u-%u", &span_from, &span_to ) ; if( span_from ) span_from-- ; break ;
			case 'n': min_diag_posns = atoi( optarg ) ; break ;
			case 'd': maxd = atoi( optarg ) ; break ;
		}
	} while( opt != -1 ) ;
-}

    let too_few = IM.size dps < 40 && not reckless
    when too_few $ do
        hPutStrLn stderr $
            "\n *** Low number (" ++ shows (IM.size dps) ") of diagnostic positions found.\n\
              \ *** I will stop now for your own safety.\n\
              \ *** If you are sure you want to shoot yourself\n\
              \ *** in the foot, read the man page to learn\n\
              \ *** how to lift this restriction.\n\n"
        exitFailure

    -- TODO  We will usually want to seek to the mitochondrion, which
    -- doesn't work with the simple 'mergeInputs' invocation.
    summary <- mergeInputs combineCoordinates files (\_ -> classify_stream dps adna) >>= run

    putStrLn (unlines [header, show_result_with output summary])

        {-
        if( mktable ) {
            fputs( infile.c_str(), stdout ) ;
            putchar( '\t' ) ;
        }
        else {
            puts( infile.c_str() ) ;
            putchar( '\n' ) ;
        }
        -}

        -- if( !maxd ) maxd = max( strlen(hum_ref.seq), strlen(maln->ref->seq) ) / 10 ;
--         char *aln_con = (char*)malloc( strlen(hum_ref.seq) + maxd + 2 ) ;
  --       char *aln_ass = (char*)malloc( strlen(maln->ref->seq) + maxd + 2 ) ;
    --     unsigned d = myers_diff( hum_ref.seq, myers_align_globally, maln->ref->seq, maxd, aln_con, aln_ass ) ;

        {-
        if( d == UINT_MAX ) {
            fprintf( stderr, "\n *** Could not align references with up to %d mismatches.\n"
                             " *** This is usually a sign of trouble, but\n"
                             " *** IF AND ONLY IF YOU KNOW WHAT YOU ARE DOING, you can\n"
                             " *** try the -d N option with N > %d.\n\n", maxd, maxd ) ;
            return 1 ;
        }
        if( mktable ) printf( "%d\t", d ) ;
        else printf( "  %d alignment distance between reference and assembly.\n", d ) ;

        if( verbose >= 6 ) print_aln( aln_con, aln_ass ) ;

        dp_list l = mk_dp_list( aln_con, aln_ass, span_from, span_to ) ;
        if( mktable ) printf( "%u\t", (unsigned)l.size() ) ;
        else printf( "  %u total differences between reference and assembly.\n", (unsigned)l.size() ) ;

        int num_strong = 0 ;
        for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i )
            if( i->second.strength > weak ) ++num_strong ;
        if( mktable ) printf( "%d\t", (int)l.size() ) ;
        else {
            printf( "  %d diagnostic positions", (int)l.size() ) ;
            if( span_from != 0 || span_to != INT_MAX )
                printf( " in range [%d,%d)", span_from, span_to ) ;
            printf( ", %d of which are strongly diagnostic.\n", num_strong ) ;
        }

        if( verbose >= 3 ) {
            print_dp_list( stderr, l.begin(), l.end(), '\n', 0 ) ;
            print_dp_list( stderr, l.begin(), l.end(), '\n', 1 ) ;
        }

-}

        {-
        if( verbose >= 2 ) fputs( "Pass one: finding actually diagnostic positions.\n", stderr ) ;
        for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s )
        {
            fixup_name( *s ) ;

            std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ;
            // are we overlapping anything at all?
            std::pair< dp_list::const_iterator, dp_list::const_iterator > p =
                overlapped_diagnostic_positions( l, *s ) ;

            if( verbose >= 3 )
            {
                fprintf( stderr, "%s/%c:\n  %d potentially diagnostic positions",
                         (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ;
                if( verbose >= 4 )
                {
                    putc( ':', stderr ) ; putc( ' ', stderr ) ;
                    print_dp_list( stderr, p.first, p.second, 0 ) ;
                }
                fprintf( stderr, "; range:  %d..%d\n", (*s)->start, (*s)->end ) ;
            }
-}


        {-
            int t = 0 ;
            for( dp_list::const_iterator i = l.begin() ; i != l.end() ; ++i )
                if( is_transversion( i->second.consensus, i->second.assembly ) ) ++t ;
            if( mktable ) printf( "%d\t%d\t", t, num_strong ) ;
            else {
                printf( "  %d effectively diagnostic positions", (int)l.size() ) ;
                if( span_from != 0 || span_to != INT_MAX )
                    printf( " in range [%d,%d)", span_from, span_to ) ;
                printf( ", %d of which are transversions.\n\n", t ) ;
            }
        if( verbose >= 3 ) print_dp_list( stderr, l.begin(), l.end(), '\n' ) ;

        std::deque< cached_pwaln >::const_iterator cpwaln = cached_pwalns.begin() ;
        for( const AlnSeqP *s = maln->AlnSeqArray ; s != maln->AlnSeqArray + maln->num_aln_seqs ; ++s, ++cpwaln )
        {
            whatsit klass = unknown ;
            int votes = 0, votes2 = 0 ;

            std::string the_ass( maln->ref->seq + (*s)->start, (*s)->end - (*s)->start + 1 ) ;
            // enough overlap?  (we only have _actually_ diagnostic positions now)
            std::pair< dp_list::const_iterator, dp_list::const_iterator > p =
                overlapped_diagnostic_positions( l, *s ) ;
            if( std::distance( p.first, p.second ) < min_diag_posns )
            {
                if( verbose >= 3 ) {
                    fputs( (*s)->id, stderr ) ;
                    putc( '/', stderr ) ;
                    putc( (*s)->segment, stderr ) ;
                    fputs( ": no diagnostic positions\n", stderr ) ;
                }
            }
            else
            {
                if( verbose >= 3 )
                {
                    fprintf( stderr, "%s/%c: %d diagnostic positions", (*s)->id, (*s)->segment, (int)std::distance( p.first, p.second ) ) ;
                    if( verbose >= 4 )
                    {
                        putc( ':', stderr ) ; putc( ' ', stderr ) ;
                        print_dp_list( stderr, p.first, p.second, 0 ) ;
                    }
                    fprintf( stderr, "; range:  %d..%d\n", (*s)->start, (*s)->end ) ;
                }

                // Hmm, all this iterator business is somewhat lacking...
                char *paln1 = aln_con, *paln2 = aln_ass ;
                int ass_pos = 0 ;
                while( ass_pos != (*s)->start && *paln1 && *paln2 )
                {
                    if( *paln2 != '-' ) ass_pos++ ;
                    ++paln1 ;
                    ++paln2 ;
                }

                char *in_ass = maln->ref->seq + (*s)->start ;
                char *in_frag_v_ass = (*s)->seq ;
                std::string::const_iterator in_frag_v_ref = cpwaln->frag_seq.begin() ;

                std::string lifted = lift_over( aln_con, aln_ass, (*s)->start, (*s)->end + 1 ) ;
                std::string in_ref = lifted.substr( 0, cpwaln->start ) ;
                in_ref.append( cpwaln->ref_seq ) ;

                while( ass_pos != (*s)->end +1 && *paln1 && *paln2 && !in_ref.empty() && *in_ass && *in_frag_v_ass && *in_frag_v_ref )
                {
                    if( *paln1 != '-' ) {
                        do {
                            in_ref=in_ref.substr(1) ;
                            in_frag_v_ref++ ;
                        } while( in_ref[0] == '-' ) ;
                    }
                    if( *paln2 != '-' ) {
                        ass_pos++ ;
                        do {
                            in_ass++ ;
                            in_frag_v_ass++ ;
                        } while( *in_ass == '-' ) ;
                    }
                    ++paln1 ;
                    ++paln2 ;
                }
                if( verbose >= 4 ) putc( '\n', stderr ) ;
            }
        }
    }
}
        -}