{-# LANGUAGE RecordWildCards, NamedFieldPuns, BangPatterns, TypeFamilies #-}
-- Estimates aDNA damage.  Crude first version.
--
-- - Read or subsample a BAM file, make a compact representation of the reads.
-- - Compute the likelihood of each read under a simple model of
--   damage, error/divergence, and contamination.
--
-- For the fitting, we simplify radically: ignore sequencing error,
-- assume damage and simple, symmetric substitutions, which subsume
-- error and divergence.
--
-- Trying to compute symbolically is too much, the high power terms get
-- out of hand quickly, and we get mixed powers of \lambda and \kappa.
-- The fastest version so far uses the cheap implementation of automatic
-- differentiation in AD.hs together with the Hager-Zhang method from
-- package nonlinear-optimization.  BFGS from hmatrix-gsl takes longer
-- to converge.  Didn't try an actual Newton iteration (yet?); AD from
-- package ad appears slower.
--
-- If I include parameters whose true value is zero, the transformation
-- to the log-odds-ratio doesn't work, because then the maximum doesn't
-- exist anymore.  For many parameters, zero makes sense, but one
-- doesn't.  A different transformation ('sigmoid2'/'isigmoid2' below)
-- allows for an actual zero (but not an actual one), while avoiding
-- ugly boundary conditions.  That appears to work well.  (See the
-- illustrative sketch just before 'lk_fun1'.)
--
-- The current hack assumes all molecules have an overhang at both ends,
-- then each base gets deaminated with a position dependent probability
-- following a geometric distribution.  If we try to model a fraction of
-- undeaminated molecules (a contaminant) in addition, this fails.  To
-- rescue the idea, I guess we must really decide if the molecule has an
-- overhang at all (probability 1/2) at each end, then deaminate it.
--
-- TODO
-- - needs better output
-- - needs support for multiple input files
-- - needs to deal with long (unmerged) reads (by ignoring them?)

import Bio.Bam.Header
import Bio.Bam.Index
import Bio.Bam.Rec
import Bio.Base
import Bio.Genocall.Adna
import Bio.Genocall.Metadata
import Bio.Iteratee
import Bio.Util.AD
import Bio.Util.AD2
import Bio.Util.Numeric
import Control.Applicative
import Control.Concurrent.Async
import Control.Monad ( unless, (>=>) )
import Data.Bits
import Data.Foldable
import Data.Ix
import Data.Maybe
import Data.String ( fromString )
import Data.Text ( unpack )
import Data.Word ( Word8 )
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.FilePath
import System.IO ( hPutStrLn, stderr )

import qualified Data.HashMap.Strict as M
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Prelude hiding ( sequence_, mapM, mapM_, concatMap, sum, minimum, foldr1, foldl )

-- | Roughly @Maybe (Nucleotide, Nucleotide)@, encoded compactly
newtype NP = NP { unNP :: Word8 } deriving (Eq, Ord, Ix)

data Seq = Merged  { unSeq :: U.Vector Word8 }
         | Mate1st { unSeq :: U.Vector Word8 }
         | Mate2nd { unSeq :: U.Vector Word8 }

instance Show NP where
    show (NP w) | w == 16   = "NN"
                | w  > 16   = "XX"
                | otherwise = [ "ACGT" !! fromIntegral (w `shiftR` 2)
                              , "ACGT" !! fromIntegral (w .&. 3) ]
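-- Illustrative sketch only, not used anywhere below:  one example of a
-- transformation with the property described in the header comment,
-- namely that 0 is attained at a finite argument while 1 is only
-- approached asymptotically.  The real 'sigmoid2'/'isigmoid2' are
-- imported above and may well be defined differently; the names and
-- definitions here are made up just to make the idea concrete.
sigmoid2_sketch :: Double -> Double
sigmoid2_sketch x = x*x / (1 + x*x)         -- maps the real line onto [0,1)

isigmoid2_sketch :: Double -> Double
isigmoid2_sketch y = sqrt (y / (1-y))       -- partial inverse on [0,1)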
{-# INLINE lk_fun1 #-}
lk_fun1 :: (Num a, Show a, Fractional a, Floating a, Memorable a)
        => Int -> Int -> [a] -> V.Vector Seq -> a
lk_fun1 lmin lmax parms = case length parms of
    1 -> V.foldl' (\a b -> a - log (lk tab00 tab00 tab00 b)) 0 . guardV       -- undamaged case
      where
        !tab00 = fromListN (rangeSize my_bounds)
                 [ l_epq p_subst 0 0 x | (_,_,x) <- range my_bounds ]

    4 -> V.foldl' (\a b -> a - log (lk tabDS tabDS1 tabDS1 b)) 0 . guardV     -- double strand case
      where
        !tabDS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d p_e x
                  | (l,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i)
                  , let p_e = mu $ lambda ^^ (l-i) ]
        !tabDS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    5 -> V.foldl' (\a b -> a - log (lk tabSS tabSS1 tabSS2 b)) 0 . guardV     -- single strand case
      where
        !tabSS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (l,i,x) <- range my_bounds
                  , let lam5 = lambda ^^ (1+i) ; lam3 = kappa ^^ (l-i)
                  , let p_d = mu $ lam3 + lam5 - lam3 * lam5 ]
        !tabSS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]
        !tabSS2 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst 0 p_d x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    _ -> error "Not supposed to happen: unexpected number of model parameters."
  where
    ~(l_subst : ~(l_sigma : ~(l_delta : ~(l_lam : ~(l_kap : _))))) = parms

    p_subst = 0.33333 * sigmoid2 l_subst
    sigma   = sigmoid2 l_sigma
    delta   = sigmoid2 l_delta
    lambda  = sigmoid2 l_lam
    kappa   = sigmoid2 l_kap

    guardV = V.filter (\u -> U.length (unSeq u) >= lmin && U.length (unSeq u) <= lmax)

    -- Likelihood given precomputed damage table.  We compute the giant
    -- table ahead of time, which maps length, index and base pair to a
    -- likelihood.
    lk tab_m _ _ (Merged  b) = U.ifoldl' (\a i np -> a * tab_m `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ tab_f _ (Mate1st b) = U.ifoldl' (\a i np -> a * tab_f `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ _ tab_s (Mate2nd b) = U.ifoldl' (\a i np -> a * tab_s `bang` index' my_bounds (U.length b, i, NP np)) 1 b

    index' bnds x | inRange bnds x = index bnds x
                  | otherwise      = error $ "Huh? " ++ show x ++ " \\nin " ++ show bnds

    my_bounds = ((lmin,0,NP 0),(lmax,lmax,NP 16))

    mu p = sigma * p + delta * (1-p)
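-- Illustrative sketch, again not used by the fit:  the position
-- dependent deamination probability as it enters 'tabDS' above, written
-- out for plain 'Double's.  The probability that the 5' overhang
-- reaches position @i@ decays geometrically with parameter @lambda@;
-- inside the overhang we deaminate at rate @sigma@, outside at rate
-- @delta@ (that is the 'mu' above).  The name and standalone signature
-- are made up for the example; the formula mirrors the code.
p_deam5_sketch :: Double   -- ^ sigma, deamination rate within the overhang
               -> Double   -- ^ delta, deamination rate in the interior
               -> Double   -- ^ lambda, geometric overhang parameter
               -> Int      -- ^ i, 0-based position from the 5' end
               -> Double
p_deam5_sketch sigma delta lambda i = sigma * p + delta * (1-p)
  where p = lambda ^^ (1+i)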
-- Likelihood for a certain pair of bases given error rate, C-T rate
-- and G-A rate.  The code is 4*ref + read:  matches sit on the
-- diagonal, C->C/C->T (codes 5/7) involve the C-T rate p, G->A/G->G
-- (codes 8/10) the G-A rate q, and the escape code contributes a
-- neutral factor of 1.
l_epq :: (Num a, Fractional a, Floating a) => a -> a -> a -> NP -> a
l_epq e p q (NP x) = case x of {
    0  -> s         ;  1  -> e             ;  2  -> e             ;  3  -> e ;
    4  -> e         ;  5  -> s-p+4*e*p     ;  6  -> e             ;  7  -> e+p-4*e*p ;
    8  -> e+q-4*e*q ;  9  -> e             ;  10 -> s-q+4*e*q     ;  11 -> e ;
    12 -> e         ;  13 -> e             ;  14 -> e             ;  15 -> s ;
    _  -> 1 }
  where s = 1 - 3 * e

lkfun :: Int -> Int -> V.Vector Seq -> U.Vector Double -> Double
lkfun lmin lmax brs parms = lk_fun1 lmin lmax (U.toList parms) brs

lkfun' :: Int -> Int -> V.Vector Seq -> [Double] -> AD
lkfun' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector parms) brs

lkfun'' :: Int -> Int -> V.Vector Seq -> [Double] -> AD2
lkfun'' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector2 parms) brs

combofn :: Int -> Int -> V.Vector Seq -> U.Vector Double -> (Double, U.Vector Double)
combofn lmin lmax brs parms = (x,g)
  where D x g = lk_fun1 lmin lmax (paramVector $ U.toList parms) brs

data Conf = Conf {
    conf_lmin     :: Int,
    conf_metadata :: FilePath,
    conf_report   :: String -> IO (),
    conf_params   :: Parameters }

defaultConf :: Conf
defaultConf = Conf 25 (error "no config file specified") (\_ -> return ()) quietParameters

options :: [OptDescr (Conf -> IO Conf)]
options = [
    Option "m"  ["min-length"]   (ReqArg set_lmin "LEN") "Set minimum length to LEN (25)",
    Option "c"  ["config"]       (ReqArg set_conf "FILE") "Configuration is stored in FILE",
    Option "v"  ["verbose"]      (NoArg set_verbose) "Print progress reports",
    Option "h?" ["help","usage"] (NoArg disp_usage) "Print this message and exit" ]
  where
    set_lmin a c = readIO a >>= \l -> return $ c { conf_lmin = l }
    set_conf f c = return $ c { conf_metadata = f }
    set_verbose c = return $ c { conf_report = hPutStrLn stderr, conf_params = debugParameters }

    disp_usage _ = do pn <- getProgName
                      let blah = "Usage: " ++ pn ++ " [OPTION...] [LIBRARY-NAME...]"
                      putStrLn $ usageInfo blah options
                      exitSuccess

main :: IO ()
main = do
    (opts, lnames, errors) <- getOpt Permute options <$> getArgs
    unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure
    conf <- foldl (>>=) (return defaultConf) opts
    mapM_ (main' conf) lnames

main' :: Conf -> String -> IO ()
main' Conf{..} lname = do
    [Library _ fs _] <- return . filter ((fromString lname ==) . library_name)
                               . concatMap sample_libraries . M.elems
                        =<< readMetadata conf_metadata

    -- XXX meh.  subsampling from multiple files is not yet supported :(
    brs <- subsampleBam (takeDirectory conf_metadata </> unpack (head fs)) >=> run $ \_ ->
           joinI $ filterStream (\b -> not (isUnmapped (unpackBam b))
                                       && G.length (b_seq (unpackBam b)) >= conf_lmin) $
           joinI $ takeStream 100000 $
           joinI $ mapStream pack_record $
           joinI $ filterStream (\u -> U.length (U.filter (<16) (unSeq u)) * 10 >= 9 * U.length (unSeq u)) $
           stream2vectorN 30000

    let lmax = V.maximum $ V.map (U.length . unSeq) brs
        v0 = crude_estimate brs
        opt v = optimize conf_params 0.0001 v
                         (VFunction $ lkfun conf_lmin lmax brs)
                         (VGradient $ snd . combofn conf_lmin lmax brs)
                         (Just . VCombined $ combofn conf_lmin lmax brs)

    results <- mapConcurrently opt [ v0, U.take 4 v0, U.take 1 v0 ]

    let mlk = minimum [ finalValue st | (_,_,st) <- results ]
        tot = sum [ exp $ mlk - finalValue st | (_,_,st) <- results ]
        p l = exp (mlk - l) / tot

        [ (p_ss, [ _, ssd_sigma_, ssd_delta_, ssd_lambda, ssd_kappa ]),
          (p_ds, [ _, dsd_sigma_, dsd_delta_, dsd_lambda ]),
          (_   , [ _ ]) ]
              = [ (p (finalValue st), map sigmoid2 $ G.toList xs) | (xs,_,st) <- results ]

        ssd_sigma = p_ss * ssd_sigma_
        ssd_delta = p_ss * ssd_delta_
        dsd_sigma = p_ds * dsd_sigma_
        dsd_delta = p_ds * dsd_delta_

    putStrLn $ "p_{ss} = " ++ show p_ss ++ ", p_{ds} = " ++ show p_ds
    putStrLn $ show DP{..}
    updateMetadata (store_dp lname DP{..}) conf_metadata

    -- Trying to get confidence intervals.  Right now, just get the
    -- gradient and Hessian at the ML point.  Gradient should be nearly
    -- zero, Hessian should be symmetric and positive definite.
    -- (Remember, we minimized.)
    mapM_ print [ (r,s) | (_,r,s) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'  conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'' conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
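-- Small illustration of the model averaging step above (not called
-- anywhere):  the three fits (single strand, double strand, undamaged)
-- are weighted by their likelihoods.  Since 'optimize' minimizes the
-- negative log-likelihood, the weight of a fit with final value @l@ is
-- exp(-l) normalized over all fits; subtracting the minimum first just
-- keeps the exponentials from underflowing.  The name and the bare
-- [Double] interface are made up for the example.
model_weights_sketch :: [Double] -> [Double]
model_weights_sketch ls = [ exp (mlk - l) / tot | l <- ls ]
  where
    mlk = minimum ls
    tot = sum [ exp (mlk - l) | l <- ls ]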
-- We'll require the MD field to be present.  Then we cook each read
-- into a list of paired bases.  Deleted bases are dropped, inserted
-- bases replaced with an escape code.
--
-- XXX This is annoying... almost, but not quite the same as the code
-- in the "Pileup" module.  This also relies on MD and doesn't offer the
-- alternative of accessing a reference genome.  (The latter may not be
-- worth the trouble.)  It also resembles the 'ECig' logic from
-- "Bio.Bam.Rmdup".
pack_record :: BamRaw -> Seq
pack_record br = if isReversed b then k (revcom u1) else k u1
  where
    b@BamRec{..} = unpackBam br

    k | isMerged b     = Merged
      | isTrimmed b    = Merged
      | isSecondMate b = Mate2nd
      | otherwise      = Mate1st

    revcom = U.reverse . U.map (\x -> if x > 15 then x else xor x 15)

    u1 = U.fromList . map unNP $ go (G.toList b_cigar) (G.toList b_seq) (fromMaybe [] $ getMd b)

    go :: [Cigar] -> [Nucleotides] -> [MdOp] -> [NP]
    go (_:*0    :cs) ns            mds  = go cs ns mds
    go           cs  ns (MdNum  0: mds) = go cs ns mds
    go           cs  ns (MdDel []: mds) = go cs ns mds
    go            _  []              _  = []
    go           []   _              _  = []

    go (Mat:*nm :cs) (n:ns) (MdNum mm:mds) = mk_pair n n  : go (Mat:*(nm-1):cs) ns (MdNum (mm-1):mds)
    go (Mat:*nm :cs) (n:ns) (MdRep n':mds) = mk_pair n n' : go (Mat:*(nm-1):cs) ns mds
    go (Mat:*nm :cs)    ns  (MdDel _ :mds) =                go (Mat:* nm   :cs) ns mds

    go (Ins:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds
    go (SMa:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds

    go (Del:*nm :cs) ns (MdDel (_:ds):mds) = go (Del:*(nm-1):cs) ns (MdDel ds:mds)
    go (Del:*nm :cs) ns (            _:mds) = go (Del:* nm   :cs) ns mds

    go (_:cs) nd mds = go cs nd mds

esc :: NP
esc = NP 16

-- Encoding of a base pair:  the first argument (the read base, taken
-- from the sequence) lands in the low two bits, the second (the
-- reference base, from MD) in the high two bits; anything that isn't
-- two proper bases becomes the escape code 16.
mk_pair :: Nucleotides -> Nucleotides -> NP
mk_pair (Ns a) = case a of 1 -> mk_pair' 0
                           2 -> mk_pair' 1
                           4 -> mk_pair' 2
                           8 -> mk_pair' 3
                           _ -> const esc
  where
    mk_pair' u (Ns b) = case b of 1 -> NP $ u .|. 0
                                  2 -> NP $ u .|. 4
                                  4 -> NP $ u .|. 8
                                  8 -> NP $ u .|. 12
                                  _ -> esc

infix 7 /%/
(/%/) :: Integral a => a -> a -> Double
0 /%/ 0 = 0
a /%/ b = fromIntegral a / fromIntegral b
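-- Tiny worked example (not used by the program; the name is made up):
-- a read 'T' over a reference 'C' packs to @NP 7@, which 'show's as
-- "CT" and is exactly the 'c_to_t' code that 'crude_estimate' below
-- counts as damage.  The 'Ns' literals follow the encoding visible in
-- 'mk_pair' itself (A=1, C=2, G=4, T=8).
_np_example :: NP
_np_example = mk_pair (Ns 8) (Ns 2)      -- read T, reference C  ==>  NP 7 ("CT")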
-- Crude estimate.  Need two overhang lengths, two deamination rates,
-- undamaged fraction, SS/DS, substitution rate.
--
-- DS or SS:  look whether CT or GA is greater at 3' terminal position √
-- Left overhang length:  ratio of damage at second position to first √
-- Right overhang length:  ratio of CT at last to snd-to-last posn √
--                         + ratio of GA at last to snd-to-last posn √
-- SS rate:  condition on damage on one end, compute rate at other √
-- DS rate:  condition on damage, compute rate in interior √
-- substitution rate:  count all substitutions not due to damage √
-- undamaged fraction:  see below √
--
-- Contaminant fraction:  let f5 (f3, f1) be the fraction of reads
-- showing damage at the 5' end (3' end, both ends).  Let a (b) be
-- the probability of an endogenous read to show damage at the 5'
-- end (3' end).  Let e be the fraction of endogenous reads.  Then
-- we have:
--
--      f5 = e * a
--      f3 = e * b
--      f1 = e * a * b
--
--      f5 * f3 / f1 = e
--
-- Straightforward and easy to understand, but in practice this method
-- produces ridiculous overestimates, ridiculous underestimates,
-- negative contamination rates, and general grief.  It's actually
-- better to start from a constant number.  (A toy version of the
-- formula follows 'crude_estimate' below, for illustration.)

crude_estimate :: V.Vector Seq -> U.Vector Double
crude_estimate seqs0 = U.fromList [ l_subst, l_sigma, l_delta, l_lam, l_kap ]
  where
    seqs = V.filter ((>= 10) . U.length) $ V.map unSeq seqs0

    total_equals = V.sum (V.map (U.length . U.filter isNotSubst) seqs)
    total_substs = V.sum (V.map (U.length . U.filter isOrdinarySubst) seqs) * 6 `div` 5
    l_subst = isigmoid2 $ max 0.001 $ total_substs /%/ (total_equals + total_substs)

    c_to_t, g_to_a, c_to_c :: Word8
    c_to_t = 7
    g_to_a = 8
    c_to_c = 5

    isNotSubst x = x < 16 && x `shiftR` 2 == x .&. 3
    isOrdinarySubst x = x < 16 && x `shiftR` 2 /= x .&. 3 && x /= c_to_t && x /= g_to_a

    ct_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_t && dmg_omega v) seqs
    cc_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_c && dmg_omega v) seqs
    ct_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_t && dmg_omega v) seqs
    cc_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_c && dmg_omega v) seqs

    dmg_omega v = v U.! (l-1) == c_to_t || v U.! (l-1) == g_to_a ||
                  v U.! (l-2) == c_to_t || v U.! (l-2) == g_to_a ||
                  v U.! (l-3) == c_to_t || v U.! (l-3) == g_to_a
      where l = U.length v

    l_lam = isigmoid2 lambda
    lambda = min 0.9 $ max 0.1 $
             (ct_at_beta * (cc_at_alpha + ct_at_alpha)) /%/ ((cc_at_beta + ct_at_beta) * ct_at_alpha)

    ct_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_t && dmg_alpha v) seqs
    cc_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_c && dmg_alpha v) seqs
    ct_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_t && dmg_alpha v) seqs
    cc_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_c && dmg_alpha v) seqs

    dmg_alpha v = v U.! 0 == c_to_t || v U.! 1 == c_to_t || v U.! 2 == c_to_t

    l_kap = isigmoid2 $ min 0.9 $ max 0.1 $
            (ct_at_psi * (cc_at_omega + ct_at_omega)) /%/ ((cc_at_psi + ct_at_psi) * ct_at_omega)

    total_inner_CCs = V.sum $ V.map (U.length . U.filter (== c_to_c) . takeInner) seqs
    total_inner_CTs = V.sum $ V.map (U.length . U.filter (== c_to_t) . takeInner) seqs
    takeInner v = U.slice 5 (U.length v - 10) v

    delta    = total_inner_CTs /%/ (total_inner_CTs + total_inner_CCs)
    raw_rate = ct_at_alpha /%/ (ct_at_alpha + cc_at_alpha)

    -- clamping is necessary if f_endo ends up wrong
    l_delta = isigmoid2 $ min 0.99 delta
    l_sigma = isigmoid2 . min 0.99 $ raw_rate / lambda
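-- Toy version of the contaminant-fraction formula from the comment
-- above 'crude_estimate' (not used by the program, for the reasons
-- given there).  Solving the little system f5 = e*a, f3 = e*b,
-- f1 = e*a*b also gives the per-end damage probabilities.  All names
-- are made up for the illustration.
endogenous_fraction_sketch :: Double -> Double -> Double -> (Double, Double, Double)
endogenous_fraction_sketch f5 f3 f1 = (e, a, b)
  where
    e = f5 * f3 / f1        -- endogenous fraction
    a = f1 / f3             -- P(damage at 5' end | endogenous)
    b = f1 / f5             -- P(damage at 3' end | endogenous)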
class Memorable a where
    type Memo a :: *
    fromListN :: Int -> [a] -> Memo a
    bang :: Memo a -> Int -> a

instance Memorable Double where
    type Memo Double = U.Vector Double
    fromListN = U.fromListN
    bang = (U.!)

instance Memorable AD where
    -- Layout:  the 'Int' is the stride (1 + number of partials); each
    -- entry stores the value followed by its partial derivatives.
    type Memo AD = (Int, U.Vector Double)

    fromListN _ [          ] = error "unexpected: tried to memorize an empty list"
    fromListN _ (C _    : _) = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D _ v:_) = (1+d, U.fromListN (n * (1+d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C a   ) = a : replicate d 0
        unp (D a da) = a : U.toList da

    bang (d, v) i = D (v U.! (d*i+0)) (U.slice (d*i+1) (d-1) v)

instance Memorable AD2 where
    -- Layout:  here the 'Int' is the number of partials @d@; each entry
    -- has stride 1+d+d*d (value, gradient, Hessian).
    type Memo AD2 = (Int, U.Vector Double)

    fromListN _ [              ] = error "unexpected: tried to memorize an empty list"
    fromListN _ (C2 _       : _) = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D2 _ v _ : _) = (d, U.fromListN (n * (1+d+d*d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C2 a       ) = a : replicate (d+d*d) 0
        unp (D2 a da dda) = a : U.toList da ++ U.toList dda

    bang (d, v) i = D2 (v U.! (stride*i))
                       (U.slice (stride*i+1)   d     v)
                       (U.slice (stride*i+1+d) (d*d) v)
      where stride = 1 + d + d*d

store_dp :: String -> DamageParameters Double -> Metadata -> Metadata
store_dp lname dp = M.map go1
  where
    go1 (Sample ls af bf ts dv) = Sample (map go2 ls) af bf ts dv
    go2 (Library nm fs dmg) | nm == fromString lname = Library nm fs (Just dp)
                            | otherwise              = Library nm fs dmg
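-- Minimal usage sketch for 'Memorable' (name made up, not referenced
-- elsewhere):  over plain 'Double's the memo table is just an unboxed
-- vector, so 'fromListN' followed by 'bang' is ordinary construction
-- and indexing.  The same two calls in 'lk_fun1' transparently switch
-- to the flat (stride, vector) representation when the element type is
-- 'AD' or 'AD2'.
_memo_double_example :: Double
_memo_double_example = tab `bang` 2             -- == 0.3
  where tab = fromListN 4 [0.1, 0.2, 0.3, 0.4 :: Double]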