{-# LANGUAGE RecordWildCards, NamedFieldPuns, BangPatterns, TypeFamilies #-}
-- Estimates aDNA damage.  Crude first version.
--
-- - Read or subsample a BAM file, make a compact representation of the reads.
-- - Compute the likelihood of each read under a simple model of
--   damage, error/divergence, and contamination.
--
-- For the fitting, we simplify radically: ignore sequencing error,
-- assume damage and simple, symmetric substitutions, which subsume
-- error and divergence.
--
-- Trying to compute the likelihood symbolically is too much: the
-- high-power terms get out of hand quickly, and we get mixed powers of
-- \lambda and \kappa.  The fastest version so far uses the cheap
-- implementation of automatic differentiation in AD.hs together with
-- the Hager-Zhang method from package nonlinear-optimization.  BFGS
-- from hmatrix-gsl takes longer to converge.  Didn't try an actual
-- Newton iteration (yet?); AD from package ad appears slower.
--
-- If I include parameters whose true value is zero, the transformation
-- to the log-odds-ratio doesn't work, because then the maximum doesn't
-- exist anymore.  For many parameters zero makes sense, but one
-- doesn't.  A different transformation ('sigmoid2'/'isigmoid2' below)
-- allows for an actual zero (but not an actual one) while avoiding
-- ugly boundary conditions.  That appears to work well.
--
-- The current hack assumes all molecules have an overhang at both
-- ends, then each base gets deaminated with a position-dependent
-- probability following a geometric distribution.  If we try to model
-- a fraction of undeaminated molecules (a contaminant) in addition,
-- this fails.  To rescue the idea, I guess we must really decide if
-- the molecule has an overhang at all (probability 1/2) at each end,
-- then deaminate it.
--
-- TODO
--   - needs better packaging, better output
--   - needs support for multiple input files(?)
--   - needs read group awareness(?)
--   - needs to deal with long (unmerged) reads (by ignoring them?)

import Bio.Bam.Header
import Bio.Bam.Index
import Bio.Bam.Rec
import Bio.Base
import Bio.Genocall.Adna
import Bio.Iteratee
import Control.Concurrent.Async
import Data.Bits
import Data.Foldable
import Data.Ix
import Data.Maybe
import Numeric.Optimization.Algorithms.HagerZhang05
import System.Environment

import qualified Data.Vector         as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import AD
import Prelude hiding ( sequence_, mapM, mapM_, concatMap, sum, minimum, foldr1 )

-- | Roughly @Maybe (Nucleotide, Nucleotide)@, encoded compactly: the
-- reference base in bits 2..3, the read base in bits 0..1, and 16 as
-- the escape code for everything else.
newtype NP = NP { unNP :: Word8 } deriving (Eq, Ord, Ix)

data Seq = Merged { unSeq :: U.Vector Word8 }
         | First  { unSeq :: U.Vector Word8 }
         | Second { unSeq :: U.Vector Word8 }

instance Show NP where
    show (NP w) | w == 16   = "NN"
                | w  > 16   = "XX"
                | otherwise = [ "ACGT" !! fromIntegral (w `shiftR` 2)
                              , "ACGT" !! fromIntegral (w .&. 3) ]

-- Maps the whole real line onto [0,1) with sigmoid2 0 == 0, so the
-- optimizer can reach an exact zero, but never an exact one.
sigmoid2, isigmoid2 :: (Num a, Fractional a, Floating a) => a -> a
sigmoid2 x = y*y where y = (exp x - 1) / (exp x + 1)
isigmoid2 y = log $ (1 + sqrt y) / (1 - sqrt y)
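
-- A quick numeric sanity check of that transformation, purely for
-- illustration ('check_sigmoid2' is a made-up name, not used by the
-- fit): since sigmoid2 is even, isigmoid2 can only recover the
-- magnitude of its argument.
check_sigmoid2 :: Double -> (Double, Double)
check_sigmoid2 x = (y, isigmoid2 y)     -- y is in [0,1); isigmoid2 y ~ abs x
  where y = sigmoid2 x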
{-# INLINE lk_fun1 #-}
lk_fun1 :: (Num a, Show a, Fractional a, Floating a, Memorable a)
        => Int -> [a] -> V.Vector Seq -> a
lk_fun1 lmax parms = case length parms of
    1 -> V.foldl' (\a b -> a - log (lk tab00 tab00 tab00 b)) 0 . guardV        -- undamaged case
      where
        !tab00 = fromListN (rangeSize my_bounds)
                 [ l_epq p_subst 0 0 x | (_,_,x) <- range my_bounds ]

    4 -> V.foldl' (\a b -> a - log (lk tabDS tabDS1 tabDS1 b)) 0 . guardV      -- double strand case
      where
        !tabDS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d p_e x
                  | (l,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i)
                  , let p_e = mu $ lambda ^^ (l-i) ]
        !tabDS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    5 -> V.foldl' (\a b -> a - log (lk tabSS tabSS1 tabSS2 b)) 0 . guardV      -- single strand case
      where
        !tabSS  = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (l,i,x) <- range my_bounds
                  , let lam5 = lambda ^^ (1+i) ; lam3 = kappa ^^ (l-i)
                  , let p_d = mu $ lam3 + lam5 - lam3 * lam5 ]
        !tabSS1 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst p_d 0 x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]
        !tabSS2 = fromListN (rangeSize my_bounds)
                  [ l_epq p_subst 0 p_d x
                  | (_,i,x) <- range my_bounds
                  , let p_d = mu $ lambda ^^ (1+i) ]

    _ -> error "Not supposed to happen: unexpected number of model parameters."
  where
    ~(l_subst : ~(l_sigma : ~(l_delta : ~(l_lam : ~(l_kap : _))))) = parms

    p_subst = 0.33333 * sigmoid2 l_subst
    sigma   = sigmoid2 l_sigma
    delta   = sigmoid2 l_delta
    lambda  = sigmoid2 l_lam
    kappa   = sigmoid2 l_kap

    guardV = V.filter (\u -> U.length (unSeq u) >= lmin && U.length (unSeq u) <= lmax)

    -- Likelihood given a precomputed damage table.  We compute the
    -- giant table ahead of time; it maps length, index and base pair
    -- to a likelihood.
    lk tab_m _ _ (Merged b) = U.ifoldl' (\a i np -> a * tab_m `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ tab_f _ (First  b) = U.ifoldl' (\a i np -> a * tab_f `bang` index' my_bounds (U.length b, i, NP np)) 1 b
    lk _ _ tab_s (Second b) = U.ifoldl' (\a i np -> a * tab_s `bang` index' my_bounds (U.length b, i, NP np)) 1 b

    index' bnds x | inRange bnds x = index bnds x
                  | otherwise      = error $ "Huh? " ++ show x ++ " \\nin " ++ show bnds

    my_bounds = ((lmin,0,NP 0),(lmax,lmax,NP 16))

    mu p = sigma * p + delta * (1-p)

-- Likelihood for a certain pair of bases given error rate, C-T rate
-- and G-A rate.
l_epq :: (Num a, Fractional a, Floating a) => a -> a -> a -> NP -> a
l_epq e p q (NP x) = case x of {
     0 -> s         ;  1 -> e         ;  2 -> e         ;  3 -> e ;
     4 -> e         ;  5 -> s-p+4*e*p ;  6 -> e         ;  7 -> e+p-4*e*p ;
     8 -> e+q-4*e*q ;  9 -> e         ; 10 -> s-q+4*e*q ; 11 -> e ;
    12 -> e         ; 13 -> e         ; 14 -> e         ; 15 -> s ;
     _ -> 1 }
  where s = 1 - 3 * e

lkfun :: Int -> V.Vector Seq -> U.Vector Double -> Double
lkfun lmax brs parms = lk_fun1 lmax (U.toList parms) brs

combofn :: Int -> V.Vector Seq -> U.Vector Double -> (Double, U.Vector Double)
combofn lmax brs parms = (x, g)
  where D x g = lk_fun1 lmax (paramVector $ U.toList parms) brs

params :: Parameters
params = defaultParameters { printFinal = False, verbose = Quiet, maxItersFac = 20 }

lmin :: Int
lmin = 25
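
-- For intuition, a minimal sketch of the position-dependent
-- deamination probability from the double strand case of 'lk_fun1'
-- ('p_ct_ds' is a made-up name, not called by the fit): the raw
-- probability of sitting in the overhang decays geometrically with
-- distance i from the 5' end, and 'mu' mixes the overhang rate sigma
-- with the interior rate delta.
p_ct_ds :: Double -> Double -> Double -> Int -> Double
p_ct_ds sigma delta lambda i = sigma * p + delta * (1-p)    -- this is 'mu p'
  where p = lambda ^^ (1+i)                                 -- geometric overhang model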
main :: IO ()
main = do
    [fp] <- getArgs
    brs <- subsampleBam fp >=> run $ \_ ->
           joinI $ filterStream (\b -> not (isUnmapped (unpackBam b))
                                    && G.length (b_seq (unpackBam b)) >= lmin) $
           joinI $ takeStream 100000 $
           joinI $ mapStream pack_record $
           joinI $ filterStream (\u -> U.length (U.filter (<16) (unSeq u)) * 10 >= 9 * U.length (unSeq u)) $
           stream2vectorN 30000

    let lmax  = V.maximum $ V.map (U.length . unSeq) brs
        v0    = crude_estimate brs
        opt v = optimize params 0.0001 v
                         (VFunction $ lkfun lmax brs)
                         (VGradient $ snd . combofn lmax brs)
                         (Just . VCombined $ combofn lmax brs)

    -- Fit the single strand (5 parameters), double strand (4) and
    -- undamaged (1) models concurrently, then weight each model by its
    -- relative likelihood.
    results <- mapConcurrently opt [ v0, U.take 4 v0, U.take 1 v0 ]

    let mlk = minimum [ finalValue st | (_,_,st) <- results ]
        tot = sum [ exp $ mlk - finalValue st | (_,_,st) <- results ]
        p l = exp (mlk - l) / tot

        [ (p_ss, [ _, ssd_sigma_, ssd_delta_, ssd_lambda, ssd_kappa ]),
          (p_ds, [ _, dsd_sigma_, dsd_delta_, dsd_lambda ]),
          (_   , [ _ ]) ] = [ (p (finalValue st), map sigmoid2 $ G.toList xs)
                            | (xs,_,st) <- results ]

        ssd_sigma = p_ss * ssd_sigma_
        ssd_delta = p_ss * ssd_delta_
        dsd_sigma = p_ds * dsd_sigma_
        dsd_delta = p_ds * dsd_delta_

    print DP{..}

-- We'll require the MD field to be present.  Then we cook each read
-- into a list of paired bases.  Deleted bases are dropped, inserted
-- bases are replaced with an escape code.
--
-- XXX  This is annoying... almost, but not quite, the same as the code
-- in the "Pileup" module.  This also relies on MD and doesn't offer
-- the alternative of accessing a reference genome.  (The latter may
-- not be worth the trouble.)  It also resembles the 'ECig' logic from
-- "Bio.Bam.Rmdup".

pack_record :: BamRaw -> Seq
pack_record br = if isReversed b then k (revcom u1) else k u1
  where
    b@BamRec{..} = unpackBam br

    k | isMerged b     = Merged
      | isTrimmed b    = Merged
      | isSecondMate b = Second
      | otherwise      = First

    revcom = U.reverse . U.map (\x -> if x > 15 then x else xor x 15)

    u1 = U.fromList . map unNP $ go (G.toList b_cigar) (G.toList b_seq) (fromMaybe [] $ getMd b)

    go :: [Cigar] -> [Nucleotides] -> [MdOp] -> [NP]
    go (_:*0 :cs) ns mds = go cs ns mds
    go cs ns (MdNum  0:mds) = go cs ns mds
    go cs ns (MdDel []:mds) = go cs ns mds
    go  _ [] _ = []
    go [] _  _ = []

    go (Mat:*nm :cs) (n:ns) (MdNum mm:mds) = mk_pair n n  : go (Mat:*(nm-1):cs) ns (MdNum (mm-1):mds)
    go (Mat:*nm :cs) (n:ns) (MdRep n':mds) = mk_pair n n' : go (Mat:*(nm-1):cs) ns mds
    go (Mat:*nm :cs)    ns  (MdDel _ :mds) =                go (Mat:* nm   :cs) ns mds

    go (Ins:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds
    go (SMa:*nm :cs) ns mds = replicate nm esc ++ go cs (drop nm ns) mds

    go (Del:*nm :cs) ns (MdDel (_:ds):mds) = go (Del:*(nm-1):cs) ns (MdDel ds:mds)
    go (Del:*nm :cs) ns (           _:mds) = go (Del:* nm   :cs) ns mds

    go (_:cs) nd mds = go cs nd mds

    esc :: NP
    esc = NP 16

    mk_pair :: Nucleotides -> Nucleotides -> NP
    mk_pair (Ns a) = case a of 1 -> mk_pair' 0                 -- read base into bits 0..1
                               2 -> mk_pair' 1
                               4 -> mk_pair' 2
                               8 -> mk_pair' 3
                               _ -> const esc
      where
        mk_pair' u (Ns b) = case b of 1 -> NP $ u .|. 0        -- reference base into bits 2..3
                                      2 -> NP $ u .|. 4
                                      4 -> NP $ u .|. 8
                                      8 -> NP $ u .|. 12
                                      _ -> esc

infix 7 /%/
(/%/) :: Integral a => a -> a -> Double
0 /%/ 0 = 0
a /%/ b = fromIntegral a / fromIntegral b

-- Crude estimate.  Needs two overhang lengths, two deamination rates,
-- undamaged fraction, SS/DS, substitution rate.
--
-- DS or SS:              look whether CT or GA is greater at 3' terminal position √
-- Left overhang length:  ratio of damage at second position to first              √
-- Right overhang length: ratio of CT at last to snd-to-last posn                  √
--                        + ratio of GA at last to snd-to-last posn                √
-- SS rate:               condition on damage on one end, compute rate at other    √
-- DS rate:               condition on damage, compute rate in interior            √
-- substitution rate:     count all substitutions not due to damage                √
-- undamaged fraction:    see below                                                √
--
-- Contaminant fraction:  let f5 (f3, f1) be the fraction of reads
-- showing damage at the 5' end (3' end, both ends).  Let a (b) be the
-- probability of an endogenous read showing damage at the 5' end (3'
-- end).  Let e be the fraction of endogenous reads.  Then we have:
--
--      f5 = e * a
--      f3 = e * b
--      f1 = e * a * b
--
--      f5 * f3 / f1 = e
--
-- Straightforward and easy to understand, but in practice this method
-- produces ridiculous overestimates, ridiculous underestimates,
-- negative contamination rates, and general grief.  It's actually
-- better to start from a constant number.  A sketch of the discarded
-- estimator follows anyway.
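
-- A minimal transcription of that discarded estimator ('endo_fraction'
-- is a made-up name and deliberately not called from 'crude_estimate');
-- f5, f3 and f1 are the observed damage fractions defined above:
endo_fraction :: Double -> Double -> Double -> Double
endo_fraction f5 f3 f1 = f5 * f3 / f1       -- e = f5 * f3 / f1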
crude_estimate :: V.Vector Seq -> U.Vector Double
crude_estimate seqs0 = U.fromList [ l_subst, l_sigma, l_delta, l_lam, l_kap ]
  where
    seqs = V.filter ((>= 10) . U.length) $ V.map unSeq seqs0

    total_equals = V.sum (V.map (U.length . U.filter isNotSubst) seqs)
    total_substs = V.sum (V.map (U.length . U.filter isOrdinarySubst) seqs) * 6 `div` 5
    l_subst = isigmoid2 $ max 0.001 $ total_substs /%/ (total_equals + total_substs)

    c_to_t, g_to_a, c_to_c :: Word8
    c_to_t = 7
    g_to_a = 8
    c_to_c = 5

    isNotSubst      x = x < 16 && x `shiftR` 2 == x .&. 3
    isOrdinarySubst x = x < 16 && x `shiftR` 2 /= x .&. 3 && x /= c_to_t && x /= g_to_a

    ct_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_t && dmg_omega v) seqs
    cc_at_alpha = V.length $ V.filter (\v -> v U.! 0 == c_to_c && dmg_omega v) seqs
    ct_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_t && dmg_omega v) seqs
    cc_at_beta  = V.length $ V.filter (\v -> v U.! 1 == c_to_c && dmg_omega v) seqs

    -- damage visible near the 3' end
    dmg_omega v = v U.! (l-1) == c_to_t || v U.! (l-1) == g_to_a ||
                  v U.! (l-2) == c_to_t || v U.! (l-2) == g_to_a ||
                  v U.! (l-3) == c_to_t || v U.! (l-3) == g_to_a
      where l = U.length v

    l_lam  = isigmoid2 lambda
    lambda = min 0.9 $ max 0.1 $
             (ct_at_beta * (cc_at_alpha + ct_at_alpha)) /%/ ((cc_at_beta + ct_at_beta) * ct_at_alpha)

    ct_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_t && dmg_alpha v) seqs
    cc_at_omega = V.length $ V.filter (\v -> v U.! (U.length v - 1) == c_to_c && dmg_alpha v) seqs
    ct_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_t && dmg_alpha v) seqs
    cc_at_psi   = V.length $ V.filter (\v -> v U.! (U.length v - 2) == c_to_c && dmg_alpha v) seqs

    -- damage visible near the 5' end
    dmg_alpha v = v U.! 0 == c_to_t || v U.! 1 == c_to_t || v U.! 2 == c_to_t

    l_kap = isigmoid2 $ min 0.9 $ max 0.1 $
            (ct_at_psi * (cc_at_omega + ct_at_omega)) /%/ ((cc_at_psi + ct_at_psi) * ct_at_omega)

    total_inner_CCs = V.sum $ V.map (U.length . U.filter (== c_to_c) . takeInner) seqs
    total_inner_CTs = V.sum $ V.map (U.length . U.filter (== c_to_t) . takeInner) seqs
    takeInner v = U.slice 5 (U.length v - 10) v

    delta    = total_inner_CTs /%/ (total_inner_CTs + total_inner_CCs)
    raw_rate = ct_at_alpha /%/ (ct_at_alpha + cc_at_alpha)

    -- clamping is necessary if f_endo ends up wrong
    l_delta = isigmoid2 $ min 0.99 delta
    l_sigma = isigmoid2 . min 0.99 $ raw_rate / lambda

-- Flat memo tables for the likelihood computation.  For plain
-- 'Double's this is just an unboxed vector; for 'AD' values each entry
-- stores the value followed by its gradient.
class Memorable a where
    type Memo a :: *
    fromListN :: Int -> [a] -> Memo a
    bang      :: Memo a -> Int -> a

instance Memorable Double where
    type Memo Double = U.Vector Double
    fromListN = U.fromListN
    bang      = (U.!)

instance Memorable AD where
    type Memo AD = (Int, U.Vector Double)
    fromListN n xs@(D _ v:_) = (1+d, U.fromListN (n * (1+d)) $ concatMap unpack xs)
      where
        !d = U.length v
        unpack (C a)    = a : replicate d 0
        unpack (D a da) = a : U.toList da
    bang (d, v) i = D (v U.! (d*i+0)) (U.slice (d*i+1) (d-1) v)
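
-- A round-trip illustration of the flat 'Memo AD' layout ('demo_memo'
-- is a made-up name, not used elsewhere): two AD values with gradients
-- of dimension 2 are packed with stride 3 as [1,0.1,0.2, 2,0.3,0.4],
-- and 'bang' slices the second entry back out.
demo_memo :: AD
demo_memo = bang (fromListN 2 xs) 1         -- == D 2 [0.3,0.4]
  where xs = [ D 1 (U.fromList [0.1, 0.2])
             , D 2 (U.fromList [0.3, 0.4]) ]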