{-# LANGUAGE DeriveGeneric #-}
module Bio.Genocall where

import Bio.Adna
import Bio.Bam.Pileup
import Bio.Prelude
import Data.Aeson

import qualified Data.HashMap.Strict    as H
import qualified Data.Set               as Set
import qualified Data.Vector            as V
import qualified Data.Vector.Unboxed    as U

-- | Simple indel calling.  We don't bother with it too much, so here's
-- the gist:  We collect variants (simply different variants, details
-- don't matter), so \(n\) variants give rise to \((n+1)*n/2\) GL values.
-- (That's two out of \((n+1)\), the reference allele, represented here as
-- no deletion and no insertion, is there, too.)  To assign these, we
-- need a likelihood for an observed variant given an assumed genotype.
--
-- For variants of equal length, the likelihood is the sum of qualities
-- of mismatching bases, but no higher than the mapping quality.  That
-- is roughly the likelihood of getting the observed sequence even
-- though the real sequence is a different variant.  For variants of
-- different length, the likelihood is the map quality.  This
-- corresponds to the assumption that indel errors in sequencing are
-- much less likely than mapping errors.  Since this is hardly our
-- priority, the approximations are hereby declared good enough.

{-# INLINE simple_indel_call #-}
simple_indel_call :: (DmgToken -> Int -> Bool -> Mat44D) -> (IndelPile,IndelPile) -> (GL, [IndelVariant])
simple_indel_call get_dmg (varsF,varsR)
    | length (varsF++varsR) <= 1 = ( U.empty, [] )
    | otherwise                  = ( simple_call $ map (mkpls False) varsF ++ map (mkpls True) varsR, vars' )
  where
    vars' = IndelVariant (V_Nucs U.empty) (V_Nuc U.empty) :
            (Set.toList . Set.fromList)
                [ IndelVariant (V_Nucs $ U.fromList d)
                               (V_Nuc  $ U.fromList $ map db_call i)
                | (_q,(d,i)) <- varsF ++ varsR
                , not (null d) || not (null i) ]

    match str = zipWith $ \(DB b q dt di _) n -> let p  = get_dmg dt di str `bang` n :-> b
                                                     p' = fromQual q
                                                 in toProb $ p + p' - p * p'

    mkpls :: Bool -> (Qual, ([Nucleotides], [DamagedBase])) -> U.Vector Prob
    mkpls str (q,(d,i)) = U.fromList [ qualToProb q +
                                       if length d /= U.length dr || length i /= U.length ir
                                       then 0 else product (match str i $ U.toList ir)
                                     | IndelVariant (V_Nucs dr) (V_Nuc ir) <- vars' ]

-- | A completely universal, completely empirical substituion model.
-- We make no attempt to distinguish damage from error.  The model is
-- cloned so we don't need to constantly flip matrices depending on
-- strand.
data SubstModel_ m = SubstModel
        { left_substs_fwd   :: {-# UNPACK #-} !(V.Vector m)
        , middle_substs_fwd ::                          !m
        , right_substs_fwd  :: {-# UNPACK #-} !(V.Vector m)
        , left_substs_rev   :: {-# UNPACK #-} !(V.Vector m)
        , middle_substs_rev ::                          !m
        , right_substs_rev  :: {-# UNPACK #-} !(V.Vector m) }
    deriving (Show, Generic)

instance ToJSON   m => ToJSON   (SubstModel_ m)
instance FromJSON m => FromJSON (SubstModel_ m)

type SubstModel = SubstModel_ Mat44D

-- | Mutable version of SubstModel, we'll probably have to accumulate in
-- this thing.
type MSubstModel = SubstModel_ MMat44D

lookupSubstModel :: SubstModel_ a -> Int -> Bool -> a
lookupSubstModel m i False
    | i >= 0 &&   i  <  V.length  (left_substs_fwd m) = V.unsafeIndex (left_substs_fwd   m)   i
    | i <  0 && (-i) <= V.length (right_substs_fwd m) = V.unsafeIndex (right_substs_fwd  m) (-i-1)
    | otherwise                                       = middle_substs_fwd m
lookupSubstModel m i True
    | i >= 0 &&   i  <  V.length  (left_substs_rev m) = V.unsafeIndex (left_substs_rev   m)   i
    | i <  0 && (-i) <= V.length (right_substs_rev m) = V.unsafeIndex (right_substs_rev  m) (-i-1)
    | otherwise                                       = middle_substs_rev m

-- Freezes a mutable substitution model into an immutable one.  Both
-- strands are combined, the result is normalized, and duplicated to
-- have a model for each strand again.
freezeSubstModel :: MSubstModel -> IO SubstModel
freezeSubstModel mm = do
    new_left   <- V.zipWithM freezeMats (left_substs_fwd   mm) (right_substs_rev  mm)
    new_middle <-            freezeMats (middle_substs_fwd mm) (middle_substs_rev mm)
    new_right  <- V.zipWithM freezeMats (right_substs_fwd  mm) (left_substs_rev   mm)

    return $ SubstModel new_left new_middle new_right
                        ( V.map complMat new_left   )
                              ( complMat new_middle )
                        ( V.map complMat new_right  )

newtype SubstModels = SubstModels (HashMap Bytes SubstModel)
  deriving (Show, Generic)

instance ToJSON SubstModels where
    toJSON (SubstModels m) = Object $ H.fromList
        [ ( decodeBytes k, toJSON v ) | (k,v) <- H.toList m ]

instance FromJSON SubstModels where
    parseJSON = withObject "map of substitution models" $ \o ->
                SubstModels . H.fromList <$> sequence
                    [ (,) (encodeBytes k) <$> parseJSON v | (k,v) <- H.toList o ]


-- | Naive SNP call; essentially the GATK model.  We compute the
-- likelihood for each base from an empirical error/damage model, then
-- hand over to 'simple_call'.  Base quality is ignored, but map quality
-- is incorporated.

{-# INLINE simple_snp_call #-}
simple_snp_call :: (DmgToken -> Int -> Bool -> Mat44D) -> (BasePile,BasePile) -> Snp_GLs
simple_snp_call get_dmg (varsF,varsR) = mk_snp_gls (simple_call $ map (mkpls False) varsF ++ map (mkpls True) varsR) ref
  where
    ref = case varsF ++ varsR of (_, DB _ _ _ _ r) : _ -> r ; _ -> nucsN
    mkpls str (qq, DB b _ dt di _) = U.generate 4 $ \n ->
                                        let x = get_dmg dt di str `bang` N (fromIntegral n) :-> b
                                        in toProb $ x + fromQual qq * (1-x)

-- | Compute @GL@ values for the simple case.  The simple case is where
-- we sample two alleles with equal probability and assume that errors
-- occur independently from each other.  This is specialized for a few
-- common cases:  two variants, because that's a typical indel; four
-- variants, because that's every SNP.

{-# INLINE simple_call #-}
simple_call :: [U.Vector Prob] -> GL
simple_call [      ]                    = U.empty
simple_call (gl:gls) = case U.length gl of
    2 -> foldl' (U.zipWith (*)) (step2 gl) $ map step2 gls
              where
                step2 v = U.fromListN 3 [ x0, (x0+x1) / 2, x1 ]
                  where x0 = U.unsafeIndex v 0
                        x1 = U.unsafeIndex v 1

    4 -> foldl' (U.zipWith (*)) (step4 gl) $ map step4 gls
              where
                step4 v = U.fromListN 10 [ x0
                                         , (x0+x1)/2, x1
                                         , (x0+x2)/2, (x1+x2)/2, x2
                                         , (x0+x3)/2, (x1+x3)/2, (x2+x3)/2, x3 ]
                  where x0 = U.unsafeIndex v 0
                        x1 = U.unsafeIndex v 1
                        x2 = U.unsafeIndex v 2
                        x3 = U.unsafeIndex v 3

    _ -> foldl' (U.zipWith (*)) (step gl) $ map step gls
              where
                step !ls = U.concatMap (\i -> let hd  = U.unsafeIndex ls i
                                                  ls' = U.unsafeTake (i+1) ls
                                              in U.map (\x -> 0.5 * (hd + x)) ls'
                                       ) (U.enumFromN 0 $ U.length ls)


-- | Make a list of genotypes, each represented as a vector of allele
-- probabilities, from four possible alleles.
--
-- This makes the most sense for SNPs.  The implied order of alleles is
-- A,C,G,T, and the resulting genotype vectors can straight forwardly be
-- mutiplied with a substitution matrix to give a sensible result.
-- (Something similar for indels could be imagined, but doesn't seem all
-- that useful.  We specialize for SNPs to get simpler types and
-- efficient code.)
--
-- "For biallelic sites the ordering is: AA,AB,BB; for triallelic
-- sites the ordering is: AA,AB,BB,AC,BC,CC, etc."

mk_snp_gts :: [Vec4D]
mk_snp_gts = [ Vec4D (0.5*(a+w)) (0.5*(b+x)) (0.5*(c+y)) (0.5*(d+z))
             | as@(_:_) <- inits [ Vec4D 1 0 0 0, Vec4D 0 1 0 0, Vec4D 0 0 1 0, Vec4D 0 0 0 1 ]
             , let Vec4D a b c d = last as
             , Vec4D w x y z <- as ]


getRow :: Int -> Mat44D -> Vec4D
getRow i (Mat44D v) = Vec4D (v U.! (4*i)) (v U.! (4*i+1)) (v U.! (4*i+2)) (v U.! (4*i+3))

setRow :: Int -> Vec4D -> Mat44D -> Mat44D
setRow i (Vec4D a b c d) (Mat44D v) = Mat44D $ v U.// [ (4*i,a), (4*i+1,b), (4*i+2,c), (4*i+3,d) ]


type Calls = Pile' Snp_GLs (GL, [IndelVariant])

-- | This pairs up GL values and the reference allele.  When
-- constructing it, we make sure the GL values are in the correct order
-- if the reference allele is listed first.
data Snp_GLs = Snp_GLs { snp_gls :: !GL, snp_refbase :: !Nucleotides }
    deriving Show

mk_snp_gls :: GL -> Nucleotides -> Snp_GLs
mk_snp_gls gl ref | U.length gl /= 10 = error "only diploid genomes are supported!"
                  | otherwise         = Snp_GLs gl ref

data Vec4D = Vec4D {-# UNPACK #-} !Double {-# UNPACK #-} !Double {-# UNPACK #-} !Double {-# UNPACK #-} !Double

vecNucs :: (Nucleotide -> Double) -> Vec4D
vecNucs f = Vec4D (f nucA) (f nucC) (f nucG) (f nucT)

vecSum :: Vec4D -> Double
vecSum (Vec4D a b c d)  = a + b + c + d

dot :: Vec4D -> Vec4D -> Double
dot (Vec4D a b c d) (Vec4D w x y z) = a*w + b*x + c*y + d*z

multmv :: Mat44D -> Vec4D -> Vec4D
multmv m v = Vec4D (dot (getRow 0 m) v) (dot (getRow 1 m) v)
                   (dot (getRow 2 m) v) (dot (getRow 3 m) v)

vecZip :: (Double -> Double -> Double) -> Vec4D -> Vec4D -> Vec4D
vecZip f (Vec4D a b c d) (Vec4D w x y z) = Vec4D (f a w) (f b x) (f c y) (f d z)

vecZipNucs :: (Double -> Nucleotide -> Double) -> Vec4D -> Vec4D
vecZipNucs f (Vec4D a b c d) = Vec4D (f a nucA) (f b nucC) (f c nucG) (f d nucT)