{-# LANGUAGE BangPatterns, TypeFamilies, TypeOperators, FlexibleInstances, FlexibleContexts, GADTs #-}
module Data.NeuralNetwork.Backend.HMatrix.Layers where

import Numeric.LinearAlgebra hiding (R, C)
import Numeric.LinearAlgebra.Devel
import qualified Data.Vector as V
import qualified Data.Vector.Storable as SV
import qualified Data.Vector.Storable.Mutable as SVM
import System.Random.MWC
import System.Random.MWC.Distributions
import Control.Monad.ST
import Control.Monad (liftM2, forM_, when)
import GHC.Float
import Data.STRef
import Data.Functor.Identity
import Control.DeepSeq
import Data.NeuralNetwork
import Data.NeuralNetwork.Backend.HMatrix.Utils

type R = Float

-- We parameterise the activation layer T, where the parameter indicates how
-- the elements are contained:
--   SinglC :. Vector, SinglC :. Matrix, MultiC :. Vector, MultiC :. Matrix
-- SinglC means the input has only one channel, while
-- MultiC means the input has more than one.

-- type function composition
data (f :: * -> *) :. (g :: * -> *) :: * -> *
-- type function: Identity
data SinglC :: * -> *
data MultiC :: * -> *

-- Tags for each form of layer
data F
data C
data A
data M
data T (c :: * -> *)
data S a b

data RunLayer :: * -> * where
  -- Densely connected layer
  --   input:   vector of size m
  --   output:  vector of size n
  --   weights: matrix of size m x n
  --   biases:  vector of size n
  Full :: !(Matrix R) -> !(Vector R) -> RunLayer F
  -- Convolutional layer
  --   input:   channels of 2D floats, of the same size (a x b), # of input channels:  m
  --   output:  channels of 2D floats, of the same size (c x d), # of output channels: n
  --            where c = a + 2*padding + 1 - s
  --                  d = b + 2*padding + 1 - t
  --   feature: matrix of (s x t), # of features: m x n
  --   padding: number of 0s padded at each side of a channel
  --   biases:  one bias per output channel, # of biases: n
  Conv :: !(V.Vector (V.Vector (Matrix R))) -> !(V.Vector R) -> Int -> RunLayer C
  -- Reshape from channels of matrices to a single vector
  --   input:   m channels of 2D matrices,
  --            assuming that all matrices are of the same size a x b
  --   output:  1D vector, the concatenation of all input channels,
  --            of size m x a x b
  As1D :: RunLayer A
  -- Max-pooling layer
  --   input:   channels of 2D floats, of the same size (a x b), # of input channels: m,
  --            assuming that a and b are both multiples of stride
  --   output:  channels of 2D floats, of the same size (c x d), # of output channels: m
  --            where c = a / stride
  --                  d = b / stride
  MaxP :: Int -> RunLayer M
  -- Activator
  --   the input can be either a 1D vector, a 2D matrix, or channels of either.
  Activation :: (R->R, R->R) -> RunLayer (T c)
  -- Stacking of two components a and b
  --   the output of a should match the input of b
  Stack :: !(RunLayer a) -> !(RunLayer b) -> RunLayer (S a b)
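-- An illustrative example (the name tinyFull is for illustration only, not
-- part of the library): a hand-built dense layer with the shapes documented
-- above, 3 inputs and 2 outputs, hence a 3x2 weight matrix and a bias vector
-- of length 2.
tinyFull :: RunLayer F
tinyFull = Full ((3><2) [ 0.1, 0.2
                        , 0.3, 0.4
                        , 0.5, 0.6 ])
                (SV.fromList [0, 0])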
instance Component (RunLayer F) where
  type Run (RunLayer F) = Identity
  type Inp (RunLayer F) = Vector R
  type Out (RunLayer F) = Vector R
  -- trace is (input, weighted-sum)
  newtype Trace (RunLayer F) = DTrace (Vector R, Vector R)
  forwardT (Full w b) !inp =
    let !bv = (inp <# w) `add` b
    in return $ DTrace (inp,bv)
  output (DTrace (_,!a)) = a
  backward l (DTrace (!iv,!bv)) !odelta rate =
    let Full w b = l
        !d = scale (negate rate) odelta
        !m = iv `outer` d
        -- back-propagated error at input
        !idelta = w #> odelta
        -- update to weights
        ---- For what reason does the expression w `add` (iv `outer` d)
        ---- entail a huge space leak? Notably, neither 'seq' nor 'deepseq'
        ---- helps at all. The only workaround is to expand the add function
        ---- and call SV.force on the result vector, which explicitly copies
        ---- it and drops the reference to the originally computed result.
        !w'= w `add` m
        -- !w'= let (r,c) = size w
        --          dat1 = flatten (tr' w)
        --          dat2 = flatten (tr' m)
        --      in matrixFromVector ColumnMajor r c $ SV.force $ dat1 `add` dat2
        !b'= b `add` d
        -- !b'= SV.force $ b `add` d
    in return $ (Full w' b', idelta)

instance Component (RunLayer C) where
  type Run (RunLayer C) = Identity
  type Inp (RunLayer C) = V.Vector (Matrix R)
  type Out (RunLayer C) = V.Vector (Matrix R)
  -- trace is (input, convolved output)
  newtype Trace (RunLayer C) = CTrace (Inp (RunLayer C), V.Vector (Matrix R))
  forwardT (Conv fs bs p) !inp =
    let !ov = parallel $ V.zipWith feature
                (tr fs)   -- feature matrices, indexed by output channel
                bs        -- biases, one per output channel
    in return $ CTrace (inp,ov)
    where
      !osize = let (x,y) = size (V.head inp)
                   (u,v) = size (V.head $ V.head fs)
               in (x+2*p-u+1, y+2*p-v+1)
      -- transpose the features matrix
      tr :: V.Vector (V.Vector a) -> V.Vector (V.Vector a)
      tr uv = let n   = V.length (V.head uv)
                  !vu = V.map (\i -> V.map (V.! i) uv) $ V.enumFromN 0 n
              in vu
      feature :: V.Vector (Matrix R) -> R -> Matrix R
      feature f b = V.foldl1' add (V.zipWith (layerCorr2 p) f inp) `add` konst b osize
  output (CTrace (_,a)) = a
  backward l (CTrace (!iv,!av)) !odelta rate =
    let Conv fs bs p = l
        -- update to the feature matrices
        m :: V.Vector (V.Vector (Matrix R))
        !m = parallel $ V.zipWith (\flts chn ->
               -- chn:  a single input channel
               -- flts: all features used for chn
               V.zipWith (\f d ->
                 let upd = scale (negate rate) (layerCorr2 p chn d)
                 in f `add` upd
               ) flts odelta
             ) fs iv
        -- update to the biases
        b :: V.Vector R
        !b = V.zipWith (\b d -> b + (negate rate) * sumElements d) bs odelta
        -- back-propagated error at input
        idelta :: V.Vector (Matrix R)
        !idelta = V.map (\f -> V.foldl1' add $ V.zipWith (layerConv2 p) f odelta) fs
    in --trace ("CL:" ++ show odelta)
       return $ (Conv m b p, idelta)

instance Component (RunLayer A) where
  type Run (RunLayer A) = Identity
  type Inp (RunLayer A) = V.Vector (Matrix R)
  type Out (RunLayer A) = Vector R
  -- trace keeps (number of channels m, channel size a*b, number of columns b, output)
  newtype Trace (RunLayer A) = ReshapeTrace (Int, Int, Int, Vector R)
  forwardT _ !inp =
    let !b = V.length inp
        (!r,!c) = size (V.head inp)
        !o = V.foldr' (\x y -> flatten x SV.++ y) SV.empty inp
    in return $ ReshapeTrace (b, r*c, c, o)
  output (ReshapeTrace (_,_,_,a)) = a
  backward a (ReshapeTrace (b,n,c,_)) !odelta _ =
    let !idelta = V.fromList $ map (reshape c) $ takesV (replicate b n) odelta
    in return $ (a, idelta)
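-- An illustrative sketch (sgdStepF and the squared-error cost are chosen for
-- illustration only, not part of the library): a single gradient step on a
-- dense layer, assuming the Component method signatures implied by the
-- instances above. Run is Identity in this backend, so the step is pure.
sgdStepF :: RunLayer F -> Vector R -> Vector R -> R -> (RunLayer F, Vector R)
sgdStepF layer inp target rate = runIdentity $ do
  tr <- forwardT layer inp
  -- delta of a squared-error cost at the output: prediction - target
  let delta = output tr `add` scale (-1) target
  -- returns the updated layer and the delta propagated back to the input
  backward layer tr delta rate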
instance Component (RunLayer M) where
  type Run (RunLayer M) = Identity
  type Inp (RunLayer M) = V.Vector (Matrix R)
  type Out (RunLayer M) = V.Vector (Matrix R)
  -- trace is (dimension of pools, index of max in each pool, pooled matrix)
  -- for each channel.
  newtype Trace (RunLayer M) = PTrace (V.Vector (IndexOf Matrix, Vector Int, Matrix R))
  -- forward divides the input matrix into stride x stride sub-matrices,
  -- and then finds the max element of each sub-matrix.
  forwardT (MaxP stride) !inp = return $ PTrace $ parallel $ V.map mk inp
    where
      mk inp = let (!i,!v) = pool stride inp in (size v, i, v)
  output (PTrace a) = V.map (\(_,_,!o) -> o) a
  -- use the saved index-of-max in each pool to propagate the error.
  backward l@(MaxP stride) (PTrace t) odelta _ =
    let !idelta = V.zipWith gen t odelta
    in return $ (l, idelta)
    where
      gen (!si,!iv,_) od = unpool stride iv od

instance (Component (RunLayer a), Component (RunLayer b),
          Run (RunLayer a) ~ Identity, Run (RunLayer b) ~ Identity,
          Out (RunLayer a) ~ Inp (RunLayer b)
         ) => Component (RunLayer (S a b)) where
  type Run (RunLayer (S a b)) = Identity
  type Inp (RunLayer (S a b)) = Inp (RunLayer a)
  type Out (RunLayer (S a b)) = Out (RunLayer b)
  newtype Trace (RunLayer (S a b)) = TTrace (Trace (RunLayer b), Trace (RunLayer a))
  forwardT (Stack a b) !i = do
    !tra <- forwardT a i
    !trb <- forwardT b (output tra)
    return $ TTrace (trb, tra)
  output (TTrace !a) = output (fst a)
  backward (Stack a b) (TTrace (!trb,!tra)) !odelta rate = do
    (b', !odelta') <- backward b trb odelta rate
    (a', !idelta ) <- backward a tra odelta' rate
    return (Stack a' b', idelta)

instance (Container c R) => Component (RunLayer (T (MultiC :. c))) where
  type Run (RunLayer (T (MultiC :. c))) = Identity
  type Inp (RunLayer (T (MultiC :. c))) = V.Vector (c R)
  type Out (RunLayer (T (MultiC :. c))) = V.Vector (c R)
  newtype Trace (RunLayer (T (MultiC :. c))) = TTraceM (V.Vector (Trace (RunLayer (T (SinglC :. c)))))
  forwardT (Activation ac) !inp = TTraceM <$> V.mapM (forwardT (Activation ac)) inp
  output (TTraceM a) = V.map output a
  backward a@(Activation ac) (TTraceM ts) !odelta r = do
    idelta <- V.zipWithM (\t d -> snd <$> backward (Activation ac) t d r) ts odelta
    return (a, idelta)

instance (Container c R) => Component (RunLayer (T (SinglC :. c))) where
  type Run (RunLayer (T (SinglC :. c))) = Identity
  type Inp (RunLayer (T (SinglC :. c))) = c R
  type Out (RunLayer (T (SinglC :. c))) = c R
  newtype Trace (RunLayer (T (SinglC :. c))) = TTraceS (c R, c R)
  forwardT (Activation (af,_)) !inp = return $ TTraceS (inp, cmap af inp)
  output (TTraceS (_,!a)) = a
  backward a@(Activation (_,ag)) (TTraceS (!iv,_)) !odelta _ =
    return $ (a, odelta `hadamard` cmap ag iv)

newFLayer :: Int                -- number of input values
          -> Int                -- number of neurons (output values)
          -> IO (RunLayer F)    -- new layer
newFLayer m n = withSystemRandom . asGenIO $ \gen -> do
  -- We build the weights in column-major order because, in the
  -- back-propagation algorithm, the computed update to the weights is in
  -- column-major order. It is therefore good for performance to keep the
  -- matrix in column-major order throughout.
  w <- buildMatrix (normal 0 0.01 gen) ColumnMajor (m,n)
  b <- return $ konst 1 n
  return $ Full w b

newCLayer :: Int                -- number of input channels
          -> Int                -- number of output channels
          -> Int                -- size of each feature
          -> Int                -- size of padding
          -> IO (RunLayer C)    -- new layer
newCLayer inpsize outsize sfilter npadding = withSystemRandom . asGenIO $ \gen -> do
    fs <- V.replicateM inpsize $ V.replicateM outsize $
            buildMatrix (truncNormal 0 0.1 gen) RowMajor (sfilter, sfilter)
    bs <- return $ V.replicate outsize 0.1
    return $ Conv fs bs npadding
  where
    truncNormal m s g = do
      x <- standard g
      if x >= 2.0 || x <= -2.0
        then truncNormal m s g
        else return $! m + s * x

buildMatrix g order (nr, nc) = do
  vals <- SV.replicateM (nr*nc) (double2Float <$> g)
  return $ matrixFromVector order nr nc vals

layerCorr2 :: Int -> Matrix R -> Matrix R -> Matrix R
layerCorr2 p k m = c_corr2d_s k padded
  where
    padded = zeroPadded p m
    (w,_)  = size k

layerConv2 :: Int -> Matrix R -> Matrix R -> Matrix R
layerConv2 p k m = c_conv2d_s k padded
  where
    padded = zeroPadded p m
    (w,_)  = size k
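-- An illustrative sketch (sampleNetwork is for illustration only, not part of
-- the library): assembling a convolution -> max-pooling -> reshape -> dense
-- stack for a single 28x28 input channel. With padding 2 and 5x5 features
-- each channel stays 28x28 (28 + 2*2 + 1 - 5 = 28), max-pooling with stride 2
-- gives 14x14, so the dense layer sees 4 * 14 * 14 = 784 inputs.
sampleNetwork :: IO (RunLayer (S C (S M (S A F))))
sampleNetwork = do
  conv <- newCLayer 1 4 5 2
  full <- newFLayer (4*14*14) 10
  return $ Stack conv (Stack (MaxP 2) (Stack As1D full))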
-- Max pool: pick out the maximum element of each stride x stride sub-matrix,
-- assuming that the row and column sizes of the original matrix are both
-- multiples of stride.
pool :: Int -> Matrix Float -> (Vector Int, Matrix Float)
pool 1 mat = let (r,c) = size mat in (SV.replicate (r*c) 0, mat)
-- pool 2 mat | orderOf mat == RowMajor = c_max_pool2_f mat
pool stride mat = runST $ do
    ori <- unsafeThawMatrix mat
    mxv <- newUndefinedMatrix RowMajor r' c'
    mxi <- newUndefinedVector (r'*c')
    forM_ [0..r'-1] $ \i -> do
      forM_ [0..c'-1] $ \j -> do
        (n,v) <- unsafeMaxIndEle ori (i*stride) (j*stride) stride stride
        unsafeWriteVector mxi (i*c'+j) n
        unsafeWriteMatrix mxv i j v
    a <- unsafeFreezeVector mxi
    b <- unsafeFreezeMatrix mxv
    return (a,b)
  where
    (r,c) = size mat
    r' = r `div` stride
    c' = c `div` stride
    unsafeMaxIndEle mm x y r c = do
      mp <- newSTRef 0
      mv <- newSTRef (-10000.0)
      forM_ [0..r-1] $ \ i -> do
        forM_ [0..c-1] $ \ j -> do
          v1 <- unsafeReadMatrix mm (x+i) (y+j)
          v0 <- readSTRef mv
          when (v1 > v0) $ do
            writeSTRef mv v1
            -- encode the position of the maximum within its pool
            -- as row * stride + column
            writeSTRef mp (i*stride+j)
      p <- readSTRef mp
      v <- readSTRef mv
      return (p, v)

-- The reverse of max pool,
-- assuming idx and mat are of the same size.
unpool :: Int -> Vector Int -> Matrix Float -> Matrix Float
unpool stride idx mat = runSTMatrix $ do
    mat' <- newMatrix' 0 r' c'
    forM_ [0..r-1] $ \i -> do
      forM_ [0..c-1] $ \j -> do
        let pos = idx SV.! (i*c+j)
        let (oi,oj) = pos `divMod` stride
        let val = mat `atIndex` (i,j)
        unsafeWriteMatrix mat' (i*stride+oi) (j*stride+oj) val
    return mat'
  where
    (r,c)   = size mat
    (r',c') = (r*stride, c*stride)

-- A slightly faster way to zero-pad the matrix,
-- compared to the fromBlocks provided by hmatrix.
zeroPadded :: Int -> Matrix Float -> Matrix Float
zeroPadded p mat = runSTMatrix $ do
    mat' <- newMatrix' 0 r' c'
    setMatrix mat' p p mat
    return mat'
  where
    (r,c)   = size mat
    (r',c') = (r+2*p, c+2*p)

-- A slightly faster version of newMatrix, based directly on the lower-level
-- Data.Vector.Storable creation.
newMatrix' :: SVM.Storable t => t -> Int -> Int -> ST s (STMatrix s t)
newMatrix' v r c = do
  vec  <- SVM.replicate (r*c) v
  vec' <- SV.unsafeFreeze vec
  unsafeThawMatrix $ reshape c vec'
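-- An illustrative example (poolExample is for illustration only, not part of
-- the library): pooling a 4x4 matrix with stride 2 yields the 2x2 matrix of
-- per-block maxima and the within-block position (row*stride + column) of
-- each maximum. Here every maximum sits at the bottom-right corner of its
-- block, so all stored indices are 3 and the pooled matrix is [[6,8],[14,16]];
-- 'unpool' scatters values back to exactly those positions, leaving zeros
-- elsewhere.
poolExample :: (Vector Int, Matrix Float)
poolExample = pool 2 ((4><4) [  1,  2,  3,  4
                             ,  5,  6,  7,  8
                             ,  9, 10, 11, 12
                             , 13, 14, 15, 16 ])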