{-# LANGUAGE RecordWildCards, NamedFieldPuns, BangPatterns, TypeFamilies #-}
-- Estimates aDNA damage.  Crude first version.
--
-- - Read or subsample a BAM file, make a compact representation of the reads.
-- - Compute the likelihood of each read under a simple model of
--   damage, error/divergence and contamination.
--
-- For the fitting, we simplify radically: ignore sequencing error,
-- assume damage and simple, symmetric substitutions, which subsume error
-- and divergence.
--
-- Trying to compute symbolically is too much, the high power terms get
-- out of hand quickly, and we get mixed powers of \lambda and \kappa.
-- The fastest version so far uses the cheap implementation of automatic
-- differentiation in AD.hs together with the Hager-Zhang method from
-- package nonlinear-optimization.  BFGS from hmatrix-gsl takes longer
-- to converge.  Didn't try an actual Newton iteration (yet?); AD from
-- package ad appears slower.
--
-- If I include parameters whose true value is zero, the transformation
-- to the log-odds-ratio doesn't work, because then the maximum doesn't
-- exist anymore.  For many parameters, zero makes sense, but one
-- doesn't.  A different transformation ('sigmoid2'/'isigmoid2'
-- below) allows for an actual zero (but not an actual one), while
-- avoiding ugly boundary conditions.  That appears to work well.
--
-- The current hack assumes all molecules have an overhang at both ends,
-- then each base gets deaminated with a position dependent probability
-- following a geometric distribution.  If we try to model a fraction of
-- undeaminated molecules (a contaminant) in addition, this fails.  To
-- rescue the idea, I guess we must really decide if the molecule has an
-- overhang at all (probability 1/2) at each end, then deaminate it.
--
-- TODO
--   - needs better output
--   - needs support for multiple input files
--   - needs to deal with long (unmerged) reads (by ignoring them?)

import Bio.Adna hiding ( bang )
import Bio.Bam.Header
import Bio.Bam.Index
import Bio.Bam.Rec
import Bio.Base
import Bio.Genocall.Metadata
import Bio.Iteratee
import Bio.Util.AD
import Bio.Util.AD2
import Bio.Util.Numeric
import Control.Applicative
import Control.Concurrent.Async
import Control.Monad ( unless )
import Data.Bits
import Data.Foldable
import Data.Ix
import Data.Maybe
import Data.String ( fromString )
import Data.Text ( unpack )
import System.Console.GetOpt
import System.Environment
import System.Exit
import System.FilePath
import System.IO ( hPutStrLn, stderr )

import qualified Data.HashMap.Strict as M
import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Prelude hiding ( sequence_, mapM, mapM_, concatMap, sum, minimum, foldr1, foldl )

-- | Roughly @Maybe (Nucleotide, Nucleotide)@, encoded compactly
newtype NP = NP { unNP :: Word8 } deriving (Eq, Ord, Ix)

data Seq = Merged  { unSeq :: U.Vector Word8 }
         | Mate1st { unSeq :: U.Vector Word8 }
         | Mate2nd { unSeq :: U.Vector Word8 }

instance Show NP where
    show (NP w) | w == 16   = "NN"
                | w  > 16   = "XX"
                | otherwise = [ "ACGT" !! fromIntegral (w `shiftR` 2)
                              , "ACGT" !! fromIntegral (w .&. 3) ]
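-- A quick illustration of the packing (this helper is documentation
-- only and is not used anywhere else in this module):  the reference
-- base sits in the high two bits, the read base in the low two bits,
-- and anything involving an N falls back to the escape code 16.
-- 'mk_pair' below produces exactly these codes, so a C-to-T
-- substitution, for instance, is @NP 7@.
np_pair :: Char -> Char -> NP
np_pair ref qry = maybe (NP 16) NP $
                  (\r q -> r `shiftL` 2 .|. q) <$> code ref <*> code qry
  where code c = lookup c $ zip "ACGT" [0..]     -- Nothing for Ns, gaps, etc.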
{-# INLINE lk_fun1 #-}
lk_fun1 :: (Num a, Show a, Fractional a, Floating a, Memorable a)
        => Int -> Int -> [a] -> V.Vector Seq -> a
lk_fun1 lmin lmax parms = case length parms of
    -- The number of parameters selects the model:  1 parameter is the
    -- undamaged model (substitution rate only), 4 parameters the double
    -- strand model, 5 parameters the single strand model.
    1 -> V.foldl' (\a b -> a - log (lk tab00 tab00 tab00 b)) 0 . guardV      -- undamaged case
      where
        !tab00 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst 0 0 x | (_,_,x) <- range my_bounds ]

    4 -> V.foldl' (\a b -> a - log (lk tabDS tabDS1 tabDS1 b)) 0 . guardV    -- double strand case
      where
        !tabDS  = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d p_e x
                    | (l,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i)
                    , let p_e = mu $ lambda ^^ (l-i) ]
        !tabDS1 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]

    5 -> V.foldl' (\a b -> a - log (lk tabSS tabSS1 tabSS2 b)) 0 . guardV    -- single strand case
      where
        !tabSS  = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (l,i,x) <- range my_bounds
                    , let lam5 = lambda ^^ (1+i) ; lam3 = kappa ^^ (l-i)
                    , let p_d = mu $ lam3 + lam5 - lam3 * lam5 ]
        !tabSS1 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst p_d 0 x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]
        !tabSS2 = fromListN (rangeSize my_bounds)
                    [ l_epq p_subst 0 p_d x
                    | (_,i,x) <- range my_bounds
                    , let p_d = mu $ lambda ^^ (1+i) ]

    _ -> error "Not supposed to happen: unexpected number of model parameters."
  where
    -- Parameters arrive on the transformed scale and are mapped back
    -- through 'sigmoid2'; missing parameters are simply never forced.
    ~(l_subst : ~(l_sigma : ~(l_delta : ~(l_lam : ~(l_kap : _))))) = parms

    p_subst = 0.33333 * sigmoid2 l_subst
    sigma   = sigmoid2 l_sigma
    delta   = sigmoid2 l_delta
    lambda  = sigmoid2 l_lam
    kappa   = sigmoid2 l_kap

    guardV = V.filter (\u -> U.length (unSeq u) >= lmin && U.length (unSeq u) <= lmax)

    -- Likelihood given a precomputed damage table.  We compute the giant
    -- table ahead of time, which maps length, index and base pair to a
    -- likelihood.
    lk tab_m _ _ (Merged  b) = U.ifoldl' (\a i np -> a * tab_m `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ tab_f _ (Mate1st b) = U.ifoldl' (\a i np -> a * tab_f `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ _ tab_s (Mate2nd b) = U.ifoldl' (\a i np -> a * tab_s `bang` index' my_bounds (U.length b, i, NP np)) 1 b

    index' bnds x | inRange bnds x = index bnds x
                  | otherwise      = error $ "Huh? " ++ show x ++ " \\nin " ++ show bnds

    my_bounds = ((lmin,0,NP 0),(lmax,lmax,NP 16))

    mu p = sigma * p + delta * (1-p)
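-- 'sigmoid2' and its inverse 'isigmoid2' come from Bio.Util.Numeric and
-- are not repeated here.  Purely to illustrate the property described in
-- the header comment (the image contains an actual zero but not an actual
-- one, so no boundary handling is needed), a transformation of that kind
-- could look like the following.  This is a hypothetical sketch, not the
-- library's definition, and it is not used anywhere in this module.
squash_example, unsquash_example :: Double -> Double
squash_example   x = x*x / (1 + x*x)        -- 0 at x == 0, approaches but never reaches 1
unsquash_example p = sqrt (p / (1 - p))     -- inverse on the non-negative branch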
-- Likelihood for a certain pair of bases given error rate, C-T rate
-- and G-A rate.
l_epq :: (Num a, Fractional a, Floating a) => a -> a -> a -> NP -> a
l_epq e p q (NP x) = case x of {
     0 -> s ;  1 -> e         ;  2 -> e          ;  3 -> e ;
     4 -> e ;  5 -> s-p+4*e*p ;  6 -> e          ;  7 -> e+p-4*e*p ;
     8 -> e+q-4*e*q ;  9 -> e ; 10 -> s-q+4*e*q  ; 11 -> e ;
    12 -> e ; 13 -> e         ; 14 -> e          ; 15 -> s ;
     _ -> 1 }
  where s = 1 - 3 * e

lkfun :: Int -> Int -> V.Vector Seq -> U.Vector Double -> Double
lkfun lmin lmax brs parms = lk_fun1 lmin lmax (U.toList parms) brs

lkfun' :: Int -> Int -> V.Vector Seq -> [Double] -> AD
lkfun' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector parms) brs

lkfun'' :: Int -> Int -> V.Vector Seq -> [Double] -> AD2
lkfun'' lmin lmax brs parms = lk_fun1 lmin lmax (paramVector2 parms) brs

combofn :: Int -> Int -> V.Vector Seq -> U.Vector Double -> (Double, U.Vector Double)
combofn lmin lmax brs parms = (x,g)
  where D x g = lk_fun1 lmin lmax (paramVector $ U.toList parms) brs

data Conf = Conf {
    conf_lmin     :: Int,
    conf_metadata :: FilePath,
    conf_report   :: String -> IO (),
    conf_params   :: Parameters }

defaultConf :: Conf
defaultConf = Conf 25 (error "no config file specified") (\_ -> return ()) quietParameters

options :: [OptDescr (Conf -> IO Conf)]
options = [
    Option "m"  ["min-length"]   (ReqArg set_lmin "LEN") "Set minimum length to LEN (25)",
    Option "c"  ["config"]       (ReqArg set_conf "FILE") "Configuration is stored in FILE",
    Option "v"  ["verbose"]      (NoArg set_verbose) "Print progress reports",
    Option "h?" ["help","usage"] (NoArg disp_usage) "Print this message and exit" ]
  where
    set_lmin a c  = readIO a >>= \l -> return $ c { conf_lmin = l }
    set_conf f c  = return $ c { conf_metadata = f }
    set_verbose c = return $ c { conf_report = hPutStrLn stderr, conf_params = debugParameters }

    disp_usage _ = do pn <- getProgName
                      let blah = "Usage: " ++ pn ++ " [OPTION...] [LIBRARY-NAME...]"
                      putStrLn $ usageInfo blah options
                      exitSuccess

main :: IO ()
main = do
    (opts, lnames, errors) <- getOpt Permute options <$> getArgs
    unless (null errors) $ mapM_ (hPutStrLn stderr) errors >> exitFailure
    conf <- foldl (>>=) (return defaultConf) opts
    mapM_ (main' conf) lnames
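-- The function main' below fits three nested models (single strand,
-- double strand, undamaged) and weighs them by relative likelihood.
-- The inline arithmetic there amounts to a softmax over the minimized
-- negative log-likelihoods; restated as a standalone sketch
-- (hypothetical helper, not used anywhere in this module):
model_weights :: [Double] -> [Double]
model_weights lks = [ exp (mlk - l) / tot | l <- lks ]
  where
    mlk = minimum lks                        -- best value; remember we minimized
    tot = sum [ exp (mlk - l) | l <- lks ]   -- normalizer, so the weights sum to one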
main' :: Conf -> String -> IO ()
main' Conf{..} lname = do
    [Library _ fs _] <- return . filter ((fromString lname ==) . library_name)
                               . concatMap sample_libraries . M.elems
                               =<< readMetadata conf_metadata

    -- XXX meh.  Subsampling from multiple files is not yet supported :(
    brs <- subsampleBam (takeDirectory conf_metadata </> unpack (head fs)) >=> run $ \_ ->
           joinI $ filterStream (\b -> not (isUnmapped (unpackBam b)) &&
                                       G.length (b_seq (unpackBam b)) >= conf_lmin) $
           joinI $ takeStream 100000 $
           joinI $ mapStream pack_record $
           joinI $ filterStream (\u -> U.length (U.filter (<16) (unSeq u)) * 10 >= 9 * U.length (unSeq u)) $
           stream2vectorN 30000

    let lmax  = V.maximum $ V.map (U.length . unSeq) brs
        v0    = crude_estimate brs
        opt v = optimize conf_params 0.0001 v
                         (VFunction $ lkfun conf_lmin lmax brs)
                         (VGradient $ snd . combofn conf_lmin lmax brs)
                         (Just . VCombined $ combofn conf_lmin lmax brs)

    results <- mapConcurrently opt [ v0, U.take 4 v0, U.take 1 v0 ]

    let mlk = minimum [ finalValue st | (_,_,st) <- results ]
        tot = sum [ exp $ mlk - finalValue st | (_,_,st) <- results ]
        p l = exp (mlk - l) / tot

        [ (p_ss, [ _, ssd_sigma_, ssd_delta_, ssd_lambda, ssd_kappa ]),
          (p_ds, [ _, dsd_sigma_, dsd_delta_, dsd_lambda ]),
          (_   , [ _ ]) ] = [ (p (finalValue st), map sigmoid2 $ G.toList xs) | (xs,_,st) <- results ]

        ssd_sigma = p_ss * ssd_sigma_
        ssd_delta = p_ss * ssd_delta_
        dsd_sigma = p_ds * dsd_sigma_
        dsd_delta = p_ds * dsd_delta_

    putStrLn $ "p_{ss} = " ++ show p_ss ++ ", p_{ds} = " ++ show p_ds
    putStrLn $ show DP{..}
    updateMetadata (store_dp lname DP{..}) conf_metadata

    -- Trying to get confidence intervals.  Right now, just get the
    -- gradient and Hessian at the ML point.  The gradient should be nearly
    -- zero, the Hessian should be symmetric and positive definite.
    -- (Remember, we minimized.)
    mapM_ print [ (r,s) | (_,r,s) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'  conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]
    putStrLn ""
    mapM_ print [ lkfun'' conf_lmin lmax brs (G.toList xs) | (xs,_,_) <- results ]

-- We'll require the MD field to be present.  Then we cook each read
-- into a list of paired bases.  Deleted bases are dropped, inserted
-- bases are replaced with an escape code.
--
-- XXX This is annoying... almost, but not quite, the same as the code
-- in the "Pileup" module.  This also relies on MD and doesn't offer the
-- alternative of accessing a reference genome.  (The latter may not be
-- worth the trouble.)  It also resembles the 'ECig' logic from
-- "Bio.Bam.Rmdup".

pack_record :: BamRaw -> Seq
pack_record br = if isReversed b then k (revcom u1) else k u1
  where
    b@BamRec{..} = unpackBam br

    k | isMerged b     = Merged
      | isTrimmed b    = Merged
      | isSecondMate b = Mate2nd
      | otherwise      = Mate1st

    -- reverse-complementing works directly on the packed codes
    revcom = U.reverse . U.map (\x -> if x > 15 then x else xor x 15)

    u1 = U.fromList . map unNP $ go (G.toList b_cigar) (G.toList b_seq) (fromMaybe [] $ getMd b)

    -- Walk CIGAR, SEQ and MD in lockstep and emit one 'NP' per read base.
    go :: [Cigar] -> [Nucleotides] -> [MdOp] -> [NP]
    go (_:*0 :cs) ns            mds  = go cs ns mds
    go cs         ns (MdNum  0: mds) = go cs ns mds
    go cs         ns (MdDel []: mds) = go cs ns mds
    go _          []            _    = []
    go []         _             _    = []

    go (Mat:*nm :cs) (n:ns) (MdNum mm:mds) = mk_pair n n  : go (Mat:*(nm-1):cs) ns (MdNum (mm-1):mds)
    go (Mat:*nm :cs) (n:ns) (MdRep n':mds) = mk_pair n n' : go (Mat:*(nm-1):cs) ns mds
    go (Mat:*nm :cs)    ns  (MdDel _ :mds) =                go (Mat:* nm   :cs) ns mds

    go (Ins:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds
    go (SMa:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds

    go (Del:*nm :cs) ns (MdDel (_:ds):mds) = go (Del:*(nm-1):cs) ns (MdDel ds:mds)
    go (Del:*nm :cs) ns (           _:mds) = go (Del:* nm   :cs) ns mds

    go (_:cs) nd mds = go cs nd mds

esc :: NP
esc = NP 16

mk_pair :: Nucleotides -> Nucleotides -> NP
mk_pair (Ns a) = case a of 1 -> mk_pair' 0
                           2 -> mk_pair' 1
                           4 -> mk_pair' 2
                           8 -> mk_pair' 3
                           _ -> const esc
  where
    mk_pair' u (Ns b) = case b of 1 -> NP $ u .|. 0
                                  2 -> NP $ u .|. 4
                                  4 -> NP $ u .|. 8
                                  8 -> NP $ u .|. 12
                                  _ -> esc
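-- For eyeballing packed reads during debugging:  render one as the list
-- of reference/read pairs it encodes, via the 'Show' instance of 'NP'
-- above.  (Illustrative helper, not wired into any code path.)
show_packed :: Seq -> String
show_packed = unwords . map (show . NP) . U.toList . unSeq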
infix 7 /%/
(/%/) :: Integral a => a -> a -> Double
0 /%/ 0 = 0
a /%/ b = fromIntegral a / fromIntegral b

-- Crude estimate.  Need two overhang lengths, two deamination rates,
-- undamaged fraction, SS/DS, substitution rate.
--
-- DS or SS: look whether CT or GA is greater at the 3' terminal position   √
-- Left overhang length:  ratio of damage at second position to first       √
-- Right overhang length: ratio of CT at last to second-to-last position    √
--                        + ratio of GA at last to second-to-last position  √
-- SS rate: condition on damage at one end, compute rate at the other       √
-- DS rate: condition on damage, compute rate in the interior               √
-- substitution rate: count all substitutions not due to damage             √
-- undamaged fraction: see below                                            √
--
-- Contaminant fraction:  let f5 (f3, f1) be the fraction of reads
-- showing damage at the 5' end (3' end, both ends).  Let a (b) be
-- the probability of an endogenous read showing damage at the 5'
-- end (3' end).  Let e be the fraction of endogenous reads.  Then
-- we have:
--
--      f5 = e * a
--      f3 = e * b
--      f1 = e * a * b
--
--      f5 * f3 / f1 = (e*a) * (e*b) / (e*a*b) = e
--
-- Straightforward and easy to understand, but in practice this method
-- produces ridiculous overestimates, ridiculous underestimates,
-- negative contamination rates, and general grief.  It's actually
-- better to start from a constant number.

crude_estimate :: V.Vector Seq -> U.Vector Double
crude_estimate seqs0 = U.fromList [ l_subst, l_sigma, l_delta, l_lam, l_kap ]
  where
    seqs = V.filter ((>= 10) . U.length) $ V.map unSeq seqs0

    total_equals = V.sum (V.map (U.length . U.filter isNotSubst) seqs)
    total_substs = V.sum (V.map (U.length . U.filter isOrdinarySubst) seqs) * 6 `div` 5
    l_subst = isigmoid2 $ max 0.001 $ total_substs /%/ (total_equals + total_substs)

    c_to_t, g_to_a, c_to_c :: Word8
    c_to_t = 7
    g_to_a = 8
    c_to_c = 5

    isNotSubst      x = x < 16 && x `shiftR` 2 == x .&. 3
    isOrdinarySubst x = x < 16 && x `shiftR` 2 /= x .&. 3 && x /= c_to_t && x /= g_to_a

    ct_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_t && dmg_omega v) seqs
    cc_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_c && dmg_omega v) seqs
    ct_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_t && dmg_omega v) seqs
    cc_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_c && dmg_omega v) seqs

    dmg_omega v = v U.! (l-1) == c_to_t || v U.! (l-1) == g_to_a ||
                  v U.! (l-2) == c_to_t || v U.! (l-2) == g_to_a ||
                  v U.! (l-3) == c_to_t || v U.! (l-3) == g_to_a
      where l = U.length v

    l_lam  = isigmoid2 lambda
    lambda = min 0.9 $ max 0.1 $ (ct_at_beta * (cc_at_alpha + ct_at_alpha)) /%/
                                 ((cc_at_beta + ct_at_beta) * ct_at_alpha)

    ct_at_omega = V.length $ V.filter (\v -> v U.! (U.length v -1) == c_to_t && dmg_alpha v) seqs
    cc_at_omega = V.length $ V.filter (\v -> v U.! (U.length v -1) == c_to_c && dmg_alpha v) seqs
    ct_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v -2) == c_to_t && dmg_alpha v) seqs
    cc_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v -2) == c_to_c && dmg_alpha v) seqs

    dmg_alpha v = v U.! 0 == c_to_t || v U.! 1 == c_to_t || v U.! 2 == c_to_t

    l_kap = isigmoid2 $ min 0.9 $ max 0.1 $ (ct_at_psi * (cc_at_omega+ct_at_omega)) /%/
                                            ((cc_at_psi+ct_at_psi) * ct_at_omega)

    total_inner_CCs = V.sum $ V.map (U.length . U.filter (== c_to_c) . takeInner) seqs
    total_inner_CTs = V.sum $ V.map (U.length . U.filter (== c_to_t) . takeInner) seqs
    takeInner v = U.slice 5 (U.length v - 10) v

    delta    = total_inner_CTs /%/ (total_inner_CTs + total_inner_CCs)
    raw_rate = ct_at_alpha /%/ (ct_at_alpha + cc_at_alpha)

    -- clamping is necessary if f_endo ends up wrong
    l_delta = isigmoid2 $ min 0.99 delta
    l_sigma = isigmoid2 . min 0.99 $ raw_rate / lambda
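-- The overhang length estimate above is just a ratio of two conditional
-- C-to-T rates:  under the geometric overhang model the deamination
-- probability at read position i is roughly proportional to lambda^(i+1),
-- so rate(position 2) / rate(position 1) estimates lambda.  Restated as a
-- standalone sketch (hypothetical helper, not used anywhere;
-- 'crude_estimate' additionally conditions on damage at the far end and
-- clamps the result to [0.1, 0.9], which also guards the degenerate cases):
lambda_from_counts :: Int -> Int -> Int -> Int -> Double
lambda_from_counts ct1 cc1 ct2 cc2 = rate2 / rate1
  where
    rate1 = ct1 /%/ (ct1 + cc1)    -- C-to-T rate at the first position
    rate2 = ct2 /%/ (ct2 + cc2)    -- C-to-T rate at the second position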
-- Memoization of likelihood values in flat unboxed vectors, so the
-- damage tables above work for plain 'Double's as well as for the AD
-- types carrying derivatives.
class Memorable a where
    type Memo a :: *
    fromListN :: Int -> [a] -> Memo a
    bang      :: Memo a -> Int -> a

instance Memorable Double where
    type Memo Double = U.Vector Double
    fromListN = U.fromListN
    bang      = (U.!)

instance Memorable AD where
    -- the Int is the stride:  one value followed by d partial derivatives
    type Memo AD = (Int, U.Vector Double)

    fromListN _ []            = error "unexpected: tried to memorize an empty list"
    fromListN _ (C _ : _)     = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D _ v:_)  = (1+d, U.fromListN (n * (1+d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C a   ) = a : replicate d 0
        unp (D a da) = a : U.toList da

    bang (d, v) i = D (v U.! (d*i+0)) (U.slice (d*i+1) (d-1) v)

instance Memorable AD2 where
    -- the Int is the number of parameters d; each entry occupies
    -- 1 + d + d*d slots:  value, gradient, Hessian
    type Memo AD2 = (Int, U.Vector Double)

    fromListN _ []               = error "unexpected: tried to memorize an empty list"
    fromListN _ (C2 _ : _)       = error "unexpected: tried to memorize a value without derivatives"
    fromListN n xs@(D2 _ v _ :_) = (d, U.fromListN (n * (1+d+d*d)) $ concatMap unp xs)
      where
        !d = U.length v
        unp (C2 a       ) = a : replicate (d+d*d) 0
        unp (D2 a da dda) = a : U.toList da ++ U.toList dda

    bang (d, v) i = D2 (v U.! (stride*i))
                       (U.slice (stride*i+1)   d     v)
                       (U.slice (stride*i+1+d) (d*d) v)
      where stride = 1 + d + d*d

store_dp :: String -> DamageParameters Double -> Metadata -> Metadata
store_dp lname dp = M.map go1
  where
    go1 (Sample ls af bf ts dv) = Sample (map go2 ls) af bf ts dv

    go2 (Library nm fs dmg) | nm == fromString lname = Library nm fs (Just dp)
                            | otherwise              = Library nm fs dmg
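-- Minimal usage sketch for the 'Memorable' machinery above (hypothetical
-- values, not part of the program):  build the flat table once, then index
-- it with 'bang' inside the likelihood fold, as 'lk_fun1' does.
memo_example :: Double
memo_example = tab `bang` 2
  where tab = fromListN 4 [0.1, 0.2, 0.3, 0.4 :: Double]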