{-# LANGUAGE BangPatterns, RecordWildCards, FlexibleContexts #-}
module Bio.Adna (
    DmgStats(..),
    damagePatternsIter,
    damagePatternsIterMD,
    damagePatternsIter2Bit,

    DamageParameters(..),
    DamageModel,
    bang,
    Alignment(..),
    FragType(..),
    To(..),
    NPair,

    noDamage,
    univDamage,
    memoDamageModel,
    Mat44,
    Mat44D
                ) where

import Bio.Bam
import Bio.Base
import Bio.TwoBit
import Control.Arrow ( (***) )
import Control.Monad
import Data.Bits ( xor )
import Data.Vec ( Mat44, Mat44D, identity, getElem, Vec4, (:.)((:.)) )

import qualified Data.Vector.Generic            as G
import qualified Data.Vector                    as V
import qualified Data.Vector.Storable           as VS
import qualified Data.Vector.Unboxed            as U
import qualified Data.Vector.Unboxed.Mutable    as UM

-- ^ Things specific to ancient DNA, e.g. damage models.
--
-- For aDNA, we need a substitution probability.  We have three options:
-- use an empirically determined PSSM, use an arithmetically defined
-- PSSM based on the /Johnson/ model, use a context sensitive PSSM based
-- on the /Johnson/ model and an alignment.  Using /Dindel/, actual
-- substitutions relative to a called haplotype would be taken into
-- account.  Since we're not going to do that, taking alignments into
-- account is difficult, somewhat approximate, and therefore not worth
-- the hassle.
--
-- We represent substitution matrices by the type 'Mat44D'.  Internally,
-- this is a vector of packed vectors.  Conveniently, each of the packed
-- vectors represents all transition /into/ the given nucleotide.


-- | A 'DamageModel' is a function that gives substitution matrices for
-- each position in a read.  The 'DamageModel' can depend on the length
-- of the read and whether its alignment is reversed.  In practice, we
-- should probably memoize precomputed damage models somehow.

type DamageModel a = Bool -> Int -> V.Vector (Mat44 a)
data To = Nucleotide :-> Nucleotide

infix 9 :->
infix 8 `bang`

-- | Convenience function to access a substitution matrix that has a
-- mnemonic reading.
{-# INLINE bang #-}
bang :: Mat44D -> To -> Double
bang m (N x :-> N y) = getElem (fromIntegral x) $ getElem (fromIntegral y) m

-- | 'DamageModel' for undamaged DNA.  The likelihoods follow directly
-- from the quality score.  This needs elaboration to see what to do
-- with amibiguity codes (even though those haven't actually been
-- observed in the wild).

{-# SPECIALIZE noDamage :: DamageModel Double #-}
noDamage :: Num a => DamageModel a
noDamage _ l = V.replicate l identity


-- | Parameters for the universal damage model.
--
-- We assume the correct model is either no damage, or single strand
-- damage, or double strand damage.  Each of them comes with a
-- probability.  It turns out that blending them into one is simply
-- accomplished by multiplying these probabilities onto the deamination
-- probabilities.
--
-- For single stranded library prep, only one kind of damage occurs (C
-- to T), it occurs at low frequency ('ssd_delta') everywhere, at high
-- frequency ('ssd_sigma') in single stranded parts, and the overhang
-- length is distributed exponentially with parameter 'ssd_lambda' at
-- the 5' end and 'ssd_kappa' at the 3' end.  (Without UDG treatment,
-- those will be equal.  With UDG, those are much smaller and in fact
-- don't literally represent overhangs.)
--
-- For double stranded library prep, we get C->T damage at the 5' end
-- and G->A at the 3' end with rate 'dsd_sigma' and both in the interior
-- with rate 'dsd_delta'.  Everything is symmetric, and therefore the
-- orientation of the aligned read doesn't matter either.  Both
-- overhangs follow a distribution with parameter 'dsd_lambda'.

data DamageParameters float = DP { ssd_sigma  :: !float         -- deamination rate in ss DNA, SS model
                                 , ssd_delta  :: !float         -- deamination rate in ds DNA, SS model
                                 , ssd_lambda :: !float         -- param for geom. distribution, 5' end, SS model
                                 , ssd_kappa  :: !float         -- param for geom. distribution, 3' end, SS model
                                 , dsd_sigma  :: !float         -- deamination rate in ss DNA, DS model
                                 , dsd_delta  :: !float         -- deamination rate in ds DNA, DS model
                                 , dsd_lambda :: !float }       -- param for geom. distribution, DS model
  deriving (Read, Show)

-- | Generic substitution matrix, has C->T and G->A deamination as
-- parameters.  Setting 'p' or 'q' to 0 as appropriate makes this apply
-- to the single stranded or undamaged case.

{-# INLINE genSubstMat #-}
genSubstMat :: Fractional a => a -> a -> Mat44 a
genSubstMat p q = vec4 ( vec4  1   0     q   0 )
                       ( vec4  0 (1-p)   0   0 )
                       ( vec4  0   0   (1-q) 0 )
                       ( vec4  0   p     0   1 )
  where
    vec4 :: a -> a -> a -> a -> Vec4 a
    vec4 a b c d = a :. b :. c :. d :. ()

memoDamageModel :: DamageModel a -> DamageModel a
memoDamageModel f = \r l -> if l > 512 || l < 0 then f r l
                            else if r then V.unsafeIndex rev l
                            else           V.unsafeIndex fwd l
  where
    rev = V.generate 512 $ f True
    fwd = V.generate 512 $ f False

{-# SPECIALIZE univDamage :: DamageParameters Double -> DamageModel Double #-}
univDamage :: Fractional a => DamageParameters a -> DamageModel a
univDamage DP{..} r l = V.generate l mat
  where
    mat i = genSubstMat (p1+p2) (q1+q2)
      where
        (p1, q1) = if r then let lam5 = ssd_lambda ^ (l-i)
                                 lam3 = ssd_kappa ^ (1+i)
                                 lam  = lam3 + lam5 - lam3 * lam5
                                 p    = ssd_sigma * lam + ssd_delta * (1-lam)
                             in (0,p)
                        else let lam5 = ssd_lambda ^ (1+i)
                                 lam3 = ssd_kappa ^ (l-i)
                                 lam  = lam3 + lam5 - lam3 * lam5
                                 p    = ssd_sigma * lam + ssd_delta * (1-lam)
                             in (p,0)

        p2      = dsd_sigma * lam5_ds + dsd_delta * (1-lam5_ds)
        q2      = dsd_sigma * lam3_ds + dsd_delta * (1-lam3_ds)
        lam5_ds = dsd_lambda ^ (1+i)
        lam3_ds = dsd_lambda ^ (l-i)


-- | Collected \"traditional\" statistics:
--
-- * Base composition near 5' end and near 3' end.  Each consists of
-- five vectors of counts of A,C,G,T, and everything else.  'basecompo5'
-- begins with 'context' bases to the left of the 5' end, 'basecompo3'
-- ends with 'context' bases to the right of the 3' end.
--
-- * Substitutions.  Counted from the reconstructed alignment, once
-- around the 5' end and once around the 3' end.  For a total of 2*4*4
-- different substitutions.
--
-- * Substitutions at CpG motivs.  Also counted from the reconstructed
-- alignment, and a CpG site is simply the sequence CG in the reference.
-- Gaps may interfere with that, but that might actually be desirable.
--
-- * Conditional substitutions.  Need an exact definition, and then a
-- couple additional tables.
--
-- XXX  This got kind of ugly.  We'll see where this goes...

data DmgStats a = DmgStats {
    basecompo5 :: [( Maybe Nucleotide, U.Vector Int )],
    basecompo3 :: [( Maybe Nucleotide, U.Vector Int )],
    substs5    :: [( To, U.Vector Int )],
    substs3    :: [( To, U.Vector Int )],
    substs5d5  :: [( To, U.Vector Int )],
    substs3d5  :: [( To, U.Vector Int )],
    substs5d3  :: [( To, U.Vector Int )],
    substs3d3  :: [( To, U.Vector Int )],
    substs5dd  :: [( To, U.Vector Int )],
    substs3dd  :: [( To, U.Vector Int )],
    substs5cpg :: [( To, U.Vector Int )],
    substs3cpg :: [( To, U.Vector Int )],
    stats_more :: a }

data FragType = Complete | Leading | Trailing deriving (Show, Eq)
type NPair = ( Nucleotides, Nucleotides )

-- Alignment record, might have been gotten from practically anywhere
-- with varying completeness.  We record anything we can get, most is
-- optional.  Reference sequence is filled with Ns if missing.
data Alignment = ALN
    { a_sequence :: !(U.Vector NPair)       -- the alignment proper
    , a_fragment_type :: !FragType }    -- was the adapter trimmed?


-- | Enumeratee (almost) that computes some statistics from plain BAM
-- (no MD field needed) and a 2bit file.  The 'Alignment' is also
-- reconstructed and passed downstream.  The result of any downstream
-- processing is available in the 'stats_more' field of the result.
--
-- * Get the reference sequence including both contexts once.  If this
--   includes invalid sequence (negative coordinate), pad suitably.
-- * Accumulate counts for the valid parts around 5' and 3' ends as
--   appropriate from flags and config.
-- * Combine the part that was aligned to (so no context) with the read
--   to reconstruct the alignment.
--
-- Arguments are the table of reference names, the 2bit file with the
-- reference, the amount of context outside the alignment desired, the
-- amount of context inside desired, and whether to compensate for
-- leeHom breakage.
--
-- For 'Complete' fragments, we cut the read in the middle, so the 5'
-- and 3' plots stay clean from each other's influence.  'Leading' and
-- 'Trailing' fragments count completely towards the appropriate end.

damagePatternsIter2Bit :: MonadIO m
                       => Refs -> TwoBitFile -> Int -> Int -> Bool
                       -> Iteratee [Alignment] m b
                       -> Iteratee [BamRaw] m (DmgStats b)
damagePatternsIter2Bit refs tbf ctx rng =
    damagePatternsIter ctx rng $ \br ->
        let b@BamRec{..} = unpackBam br
            ref_nm = sq_name $ getRef refs b_rname
            ref    = getFragment tbf ref_nm (b_pos - ctx) (alignedLength b_cigar + 2*ctx)
            pairs  = aln_from_ref (U.drop ctx ref) b_seq b_cigar
        in if isUnmapped b
           then Nothing
           else Just (b, ref, pairs)


-- | Enumeratee (almost) that computes some statistics from plain BAM
-- with a valid MD field.  The 'Alignment' is also reconstructed and
-- passed downstream.  The result of any downstream processing is
-- available in the 'stats_more' field of the result.
--
-- * Reconstruct the alignment from CIGAR, SEQ, and MD.
-- * Filter the alignment to get the reference sequence, accumulate it.
-- * Accumulate everything over the alignment.
--
-- Arguments are just the amount of context inside desired.
-- Arguments are the amount of context inside desired, and whether to
-- compensate for leeHom breakage.
--
-- For 'Complete' fragments, we cut the read in the middle, so the 5'
-- and 3' plots stay clean from each other's influence.  'Leading' and
-- 'Trailing' fragments count completely towards the appropriate end.

damagePatternsIterMD :: MonadIO m
                     => Int -> Bool
                     -> Iteratee [Alignment] m b
                     -> Iteratee [BamRaw] m (DmgStats b)
damagePatternsIterMD rng =
    damagePatternsIter 0 rng $ \br -> do
        let b@BamRec{..} = unpackBam br
        guard (not $ isUnmapped b)
        md <- getMd b
        let pairs = aln_from_md b_seq b_cigar md
            ref   = U.map fromN $ U.filter ((/=) gap . fst) pairs
        return (b, ref, pairs)
  where
    fromN (ns,_) | ns == nucsA = 2
                 | ns == nucsC = 1
                 | ns == nucsG = 3
                 | ns == nucsT = 0
                 | otherwise   = 4

-- | Common logic for statistics. The function 'get_ref_and_aln'
-- reconstructs reference sequence and alignment from a Bam record.  It
-- is expected to construct the alignment with respect to the forwards
-- strand of the reference; we reverse-complement it if necessary.
damagePatternsIter :: MonadIO m
                   => Int -> Int
                   -> (BamRaw -> Maybe (BamRec, U.Vector Word8, U.Vector NPair))
                   -> Bool -> Iteratee [Alignment] m b
                   -> Iteratee [BamRaw] m (DmgStats b)
damagePatternsIter ctx rng get_ref_and_aln leeHom it = do
    let maxwidth = ctx + rng
    acc_bc <- liftIO $ UM.replicate (2 * 5 *    maxwidth) (0::Int)
    acc_st <- liftIO $ UM.replicate (2 * 4 * 4 * 4 * rng) (0::Int)
    acc_cg <- liftIO $ UM.replicate (2 * 2 * 4 *     rng) (0::Int)

    let do_bc br = case fmap revcom_both $ get_ref_and_aln br of
            Nothing                         -> return []
            Just (b@BamRec{..}, ref, a_sequence) -> liftIO $ do

              let good_pairs     = U.indexed             a_sequence
                  good_pairs_rev = U.indexed $ U.reverse a_sequence
                  a_fragment_type | isFirstMate  b && isPaired     b = Leading
                                  | isSecondMate b && isPaired     b = Trailing
                                  | leeHom                           = Complete
                                  | isFirstMate  b || isSecondMate b = Complete     -- old style flagging
                                  | isTrimmed    b || isMerged     b = Complete     -- new style flagging
                                  | otherwise                        = Leading

              -- basecompositon near 5' end, near 3' end
              let width  = ctx + min rng (alignedLength b_cigar `div` 2)
              mapM_ (\i -> bump (fromIntegral (ref U.!  i                   ) * maxwidth + i) acc_bc) [0 .. width-1]
              mapM_ (\i -> bump (fromIntegral (ref U.! (i + U.length ref) +6) * maxwidth + i) acc_bc) [-width .. -1]

              -- For substitutions, decide what damage class we're in:
              -- 0 - no damage, 1 - damaged 5' end, 2 - damaged 3' end, 3 - both
              let dmgbase = 2*4*4*rng *
                              ( (if U.null a_sequence || U.head a_sequence /= (nucsC,nucsT) then 1 else 0)
                              + (if U.null a_sequence || U.last a_sequence /= (nucsC,nucsT) then 2 else 0) )

              -- substitutions near 5' end
              let len_at_5 = case a_fragment_type of Leading  -> min rng (G.length b_seq)
                                                     Complete -> min rng (G.length b_seq `div` 2)
                                                     Trailing -> 0
              U.forM_ (U.take len_at_5 good_pairs) $
                    \(i,uv) -> withPair uv $ \j -> bump (j * rng + i + dmgbase) acc_st

              -- substitutions at CpG sites near 5' end
              U.zipWithM_
                  (\(i,(u,v)) (_,(w,z)) ->
                      when (u == nucsC && w == nucsG) $ do
                          withNs v $ \y -> bump (  y   * rng +  i ) acc_cg
                          withNs z $ \y -> bump ((y+4) * rng + i+1) acc_cg)
                  (U.take len_at_5 good_pairs) (U.drop 1 good_pairs)

              -- substitutions near 3' end
              let len_at_3 = case a_fragment_type of Leading  -> 0
                                                     Complete -> min rng (G.length b_seq `div` 2)
                                                     Trailing -> min rng (G.length b_seq)
              U.forM_ (U.take len_at_3 good_pairs_rev) $
                    \(i,uv) -> withPair uv $ \j -> bump ((17+j) * rng -i -1 + dmgbase) acc_st

              -- substitutions at CpG sites near 3' end
              U.zipWithM_
                  (\(_,(u,v)) (i,(w,z)) ->
                      when (u == nucsC && w == nucsG) $ do
                          withNs v $ \y -> bump ((y+ 9) * rng - i-2) acc_cg
                          withNs z $ \y -> bump ((y+13) * rng - i-1) acc_cg)
                  (U.drop 1 good_pairs_rev) (U.take len_at_3 good_pairs_rev)


              return [ ALN{..} ]
          where
            {-# INLINE withPair #-}
            withPair (Ns u, Ns v) k = case pairTab `U.unsafeIndex` fromIntegral (16*u+v) of
                    j -> if j >= 0 then k j else return ()

            !pairTab = U.replicate 256 (-1) U.//
                       [ (fromIntegral $ 16*u+v, x*4+y) | (Ns u,x) <- zip [nucsA, nucsC, nucsG, nucsT] [0,1,2,3]
                                                        , (Ns v,y) <- zip [nucsA, nucsC, nucsG, nucsT] [0,1,2,3] ]

            {-# INLINE bump #-}
            bump i v = UM.unsafeRead v i >>= UM.unsafeWrite v i . succ

            {-# INLINE withNs #-}
            withNs ns k | ns == nucsA = k 0
                        | ns == nucsC = k 1
                        | ns == nucsG = k 2
                        | ns == nucsT = k 3
                        | otherwise   = return ()


    it'  <- concatMapStreamM do_bc it

    let nsubsts = 2*4*4*rng
        mk_substs off = sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((4*i+j)*rng + off*nsubsts) rng acc_st)
                                 | (i,n1) <- zip [0..] [nucA..nucT]
                                 , (j,n2) <- zip [0..] [nucA..nucT] ]

    accs <- liftIO $ DmgStats <$> sequence [ (,) nuc <$> U.unsafeFreeze (UM.slice (i*maxwidth) maxwidth acc_bc)
                                           | (i,nuc) <- zip [2,1,3,0,4] [Just nucA, Just nucC, Just nucG, Just nucT, Nothing] ]
                              <*> sequence [ (,) nuc <$> U.unsafeFreeze (UM.slice (i*maxwidth) maxwidth acc_bc)
                                           | (i,nuc) <- zip [7,6,8,5,9] [Just nucA, Just nucC, Just nucG, Just nucT, Nothing] ]

                              <*> mk_substs 0
                              <*> mk_substs 1
                              <*> mk_substs 2
                              <*> mk_substs 3
                              <*> mk_substs 4
                              <*> mk_substs 5
                              <*> mk_substs 6
                              <*> mk_substs 7

                              <*> sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((i+j)*rng) rng acc_cg)
                                           | (i,n1) <- [(0,nucC), (4,nucG)]
                                           , (j,n2) <- zip [0..] [nucA..nucT] ]

                              <*> sequence [ (,) (n1 :-> n2) <$> U.unsafeFreeze (UM.slice ((i+j)*rng) rng acc_cg)
                                           | (i,n2) <- [(8,nucC), (12,nucG)]
                                           , (j,n1) <- zip [0..] [nucA..nucT] ]

    accs' <- accs `liftM` lift (run it')
    let vsum = foldl1 (zipWith s1) . map ($ accs')
        s1 (x :-> y, u) (z :-> w, v) | x == z && y == w = (x :-> y, U.zipWith (+) u v)
                                     | otherwise = error $ "Mismatch in zip.  This is a bug."

    return $ accs' { substs5 = vsum [ substs5, substs5d5, substs5d3, substs5dd ]
                   , substs3 = vsum [ substs3, substs3d5, substs3d3, substs3dd ]
                   , substs5d5 = vsum [ substs5d5, substs5dd ]
                   , substs3d5 = vsum [ substs3d5, substs3dd ]
                   , substs5d3 = vsum [ substs5d3, substs5dd ]
                   , substs3d3 = vsum [ substs3d3, substs3dd ] }


revcom_both :: ( BamRec, U.Vector Word8, U.Vector (Nucleotides, Nucleotides) )
            -> ( BamRec, U.Vector Word8, U.Vector (Nucleotides, Nucleotides) )
revcom_both (b, ref, pairs)
    | isReversed b = ( b, revcom_ref ref, revcom_pairs pairs )
    | otherwise    = ( b,            ref,              pairs )
  where
    revcom_ref   = U.reverse . U.map (xor 2)
    revcom_pairs = U.reverse . U.map (compls *** compls)


-- | Reconstructs the alignment from reference, query, and cigar.  Only
-- positions where the query is not gapped are produced.
aln_from_ref :: U.Vector Word8 -> Vector_Nucs_half Nucleotides -> VS.Vector Cigar -> U.Vector NPair
aln_from_ref ref0 qry0 cig0 = U.fromList $ step ref0 qry0 cig0
  where
    step ref qry cig1
        | U.null ref || G.null qry || G.null cig1 = []
        | otherwise = case G.unsafeHead cig1 of { op :* n ->
                      case G.unsafeTail cig1 of { cig ->
                      case op of {

        Mat -> zipWith (\r q -> (nn r,q)) (G.toList (G.take n ref))
                                          (G.toList (G.take n qry)) ++ step (G.drop n ref) (G.drop n qry) cig ;
        Del ->                                                         step (G.drop n ref)           qry  cig ;
        Ins ->    map (\q -> ( gap,  q )) (G.toList (G.take n qry)) ++ step           ref  (G.drop n qry) cig ;
        SMa ->    map (\q -> ( gap,  q )) (G.toList (G.take n qry)) ++ step           ref  (G.drop n qry) cig ;
        HMa ->   replicate n (gap, nucsN)                           ++ step           ref            qry  cig ;
        Nop ->                                                         step           ref            qry  cig ;
        Pad ->                                                         step           ref            qry  cig }}}

    nn 0 = nucsT
    nn 1 = nucsC
    nn 2 = nucsA
    nn 3 = nucsG
    nn _ = nucsN


-- | Reconstructs the alignment from query, cigar, and md.  Only
-- positions where the query is not gapped are produced.
aln_from_md :: Vector_Nucs_half Nucleotides -> VS.Vector Cigar -> [MdOp] -> U.Vector NPair
aln_from_md qry0 cig0 md0 = U.fromList $ step qry0 cig0 md0
  where
    step qry cig1 md
        | G.null qry || G.null cig1 || null md = []
        | otherwise = case G.unsafeHead cig1 of op :* n -> step' qry op n (G.unsafeTail cig1) md

    step' qry  _ 0 cig             md  = step  qry      cig md
    step' qry op n cig (MdNum  0 : md) = step' qry op n cig md
    step' qry op n cig (MdDel [] : md) = step' qry op n cig md

    step' qry Mat n cig (MdNum m : md)
            | n <  m = map (\q -> (q,q)) (G.toList (G.take n qry)) ++ step  (G.drop n qry)           cig (MdNum (m-n) : md)
            | n >  m = map (\q -> (q,q)) (G.toList (G.take m qry)) ++ step' (G.drop m qry) Mat (n-m) cig                md
            | n == m = map (\q -> (q,q)) (G.toList (G.take n qry)) ++ step  (G.drop n qry)           cig                md
    step' qry Mat n cig (MdRep c : md) =         ( c, G.head qry )  : step' (G.tail   qry) Mat (n-1) cig                md
    step'   _ Mat _   _          _     = []

    step' qry Del n cig (MdDel (_:ss) : md) = step' qry Del (n-1) cig (MdDel ss : md)
    step'   _ Del _   _               _     = []

    step' qry Ins n cig                 md  = map ((,) gap) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig md
    step' qry SMa n cig                 md  = map ((,) gap) (G.toList (G.take n qry)) ++ step (G.drop n qry) cig md
    step' qry HMa n cig                 md  =                replicate n (gap, nucsN) ++ step           qry  cig md
    step' qry Nop _ cig                 md  =                                            step           qry  cig md
    step' qry Pad _ cig                 md  =                                            step           qry  cig md