{-# LANGUAGE CPP                 #-}
{-# LANGUAGE GADTs               #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications    #-}
{-# LANGUAGE ViewPatterns        #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.CodeGen.Permute
-- Copyright   : [2016..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.CodeGen.Permute (

  mkPermute,

) where

import Data.Array.Accelerate.AST
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Representation.Array
import Data.Array.Accelerate.Representation.Elt
import Data.Array.Accelerate.Representation.Shape
import Data.Array.Accelerate.Representation.Type

import Data.Array.Accelerate.LLVM.CodeGen.Arithmetic                as A
import Data.Array.Accelerate.LLVM.CodeGen.Array
import Data.Array.Accelerate.LLVM.CodeGen.Base
import Data.Array.Accelerate.LLVM.CodeGen.Constant
import Data.Array.Accelerate.LLVM.CodeGen.Environment
import Data.Array.Accelerate.LLVM.CodeGen.Exp
import Data.Array.Accelerate.LLVM.CodeGen.IR
import Data.Array.Accelerate.LLVM.CodeGen.Monad
import Data.Array.Accelerate.LLVM.CodeGen.Permute
import Data.Array.Accelerate.LLVM.CodeGen.Ptr
import Data.Array.Accelerate.LLVM.CodeGen.Sugar
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Base
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Loop
import Data.Array.Accelerate.LLVM.PTX.Target

import LLVM.AST.Type.AddrSpace
import LLVM.AST.Type.Instruction
import LLVM.AST.Type.Instruction.Atomic
import LLVM.AST.Type.Instruction.RMW                                as RMW
import LLVM.AST.Type.Instruction.Volatile
import LLVM.AST.Type.Operand
import LLVM.AST.Type.Representation

import Foreign.CUDA.Analysis

import Control.Monad                                                ( void )
import Control.Monad.State                                          ( gets )
import Prelude


-- Forward permutation specified by an index mapping. The resulting array is
-- initialised with the given defaults, and any further values that are
-- permuted into the result array are added to the current value using the
-- combination function.
--
-- The combination function must be /associative/ and /commutative/. Elements
-- that are mapped to the magic index 'ignore' are dropped.
--
-- Parallel forward permutation has to take special care because different
-- threads could concurrently try to update the same memory location. Where
-- available we make use of special atomic instructions and other
-- optimisations, but in the general case each element of the output array has
-- a lock which must be obtained by the thread before it can update that
-- memory location.
--
-- TODO: After too many failures to acquire the lock on an element, the thread
-- should back off and try a different element, adding this failed element to
-- a queue or some such.
--
mkPermute
    :: HasCallStack
    => Gamma aenv
    -> ArrayR (Array sh e)
    -> ShapeR sh'
    -> IRPermuteFun PTX aenv (e -> e -> e)
    -> IRFun1 PTX aenv (sh -> PrimMaybe sh')
    -> MIRDelayed PTX aenv (Array sh e)
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh' e))
mkPermute aenv repr shr' IRPermuteFun{..} project arr =
  case atomicRMW of
    Just (rmw, f) -> mkPermute_rmw   aenv repr shr' rmw f   project arr
    _             -> mkPermute_mutex aenv repr shr' combine project arr
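
-- As a rough sketch only (this pseudocode is illustrative and is not emitted
-- verbatim), the kernel generated in the general, lock-based case behaves
-- like the following, where 'project' and 'combine' stand for the
-- user-supplied index and combination functions and 'lock' is the
-- per-element lock array used by 'mkPermute_mutex':
--
-- > for ( i = gid; i < size(shIn); i += gridDim.x * blockDim.x ) {
-- >     dst = project( fromIndex(shIn, i) );
-- >     if ( dst != ignore ) {
-- >         j = toIndex(shOut, dst);
-- >         acquire( &lock[j] );               /* spin until locked */
-- >         out[j] = combine( in[i], out[j] );
-- >         release( &lock[j] );
-- >     }
-- > }
--
-- When the combination function maps onto a hardware atomic, the lock array
-- and critical section are not needed, and 'mkPermute_rmw' emits a single
-- atomic read-modify-write instruction instead.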

-- Parallel forward permutation function which uses atomic instructions to
-- implement lock-free array updates.
--
-- Atomic instruction support on CUDA devices is a bit patchy, so depending on
-- the element type and compute capability of the target hardware we may need
-- to emulate the operation using atomic compare-and-swap.
--
--               Int32    Int64    Float16    Float32    Float64
--         +-------------------------------------------------------
--   (+)   |     2.0      2.0      7.0        2.0        6.0
--   (-)   |     2.0      2.0       x          x          x
--   (.&.) |     2.0      3.2
--   (.|.) |     2.0      3.2
--   xor   |     2.0      3.2
--   min   |     2.0      3.2       x          x          x
--   max   |     2.0      3.2       x          x          x
--   CAS   |     2.0      2.0
--
-- Note that NVPTX requires at least compute 2.0, so we can always implement
-- the lock-free update operations in terms of compare-and-swap.
--
mkPermute_rmw
    :: HasCallStack
    => Gamma aenv
    -> ArrayR (Array sh e)
    -> ShapeR sh'
    -> RMWOperation
    -> IRFun1 PTX aenv (e -> e)
    -> IRFun1 PTX aenv (sh -> PrimMaybe sh')
    -> MIRDelayed PTX aenv (Array sh e)
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh' e))
mkPermute_rmw aenv (ArrayR shr tp) shr' rmw update project marr = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  --
  let
      outR                = ArrayR shr' tp
      (arrOut, paramOut)  = mutableArray outR "out"
      (arrIn,  paramIn)   = delayedArray "in" marr
      paramEnv            = envParam aenv
      start               = liftInt 0
      --
      bytes               = bytesElt tp
      compute             = computeCapability dev
      compute32           = Compute 3 2
      compute60           = Compute 6 0
      compute70           = Compute 7 0
  --
  makeOpenAcc "permute_rmw" (paramOut ++ paramIn ++ paramEnv) $ do

    shIn <- delayedExtent arrIn
    end  <- shapeSize shr shIn

    imapFromTo start end $ \i -> do

      ix  <- indexOfInt shr shIn i
      ix' <- app1 project ix

      when (isJust ix') $ do
        j <- intOfIndex shr' (irArrayShape arrOut) =<< fromJust ix'
        x <- app1 (delayedLinearIndex arrIn) i
        r <- app1 update x

        case rmw of
          Exchange
            -> writeArray TypeInt arrOut j r
          --
          _ | TupRsingle (SingleScalarType s) <- tp
            , adata                           <- irArrayData arrOut
            -> do
                addr <- instr' $ GetElementPtr (asPtr defaultAddrSpace (op s adata)) [op integralType j]
                --
                let
                    rmw_integral :: IntegralType t -> Operand (Ptr t) -> Operand t -> CodeGen PTX ()
                    rmw_integral t ptr val
                      | primOk    = void . instr' $ AtomicRMW (IntegralNumType t) NonVolatile rmw ptr val (CrossThread, AcquireRelease)
                      | otherwise =
                          case rmw of
                            RMW.And -> atomicCAS_rmw s' (A.band t (ir t val)) ptr
                            RMW.Or  -> atomicCAS_rmw s' (A.bor  t (ir t val)) ptr
                            RMW.Xor -> atomicCAS_rmw s' (A.xor  t (ir t val)) ptr
                            RMW.Min -> atomicCAS_cmp s' A.lt ptr val
                            RMW.Max -> atomicCAS_cmp s' A.gt ptr val
                            _       -> internalError "unexpected transition"
                      where
                        s'     = NumSingleType (IntegralNumType t)
                        primOk = compute >= compute32
                              || bytes == 4
                              || case rmw of
                                   RMW.Add -> True
                                   RMW.Sub -> True
                                   _       -> False

                    rmw_floating :: FloatingType t -> Operand (Ptr t) -> Operand t -> CodeGen PTX ()
                    rmw_floating t ptr val =
                      case rmw of
                        RMW.Min       -> atomicCAS_cmp s' A.lt ptr val
                        RMW.Max       -> atomicCAS_cmp s' A.gt ptr val
                        RMW.Sub       -> atomicCAS_rmw s' (A.sub n (ir t val)) ptr
                        RMW.Add
                          | primAdd   -> atomicAdd_f t ptr val
                          | otherwise -> atomicCAS_rmw s' (A.add n (ir t val)) ptr
                        _             -> internalError "unexpected transition"
                      where
                        n       = FloatingNumType t
                        s'      = NumSingleType n
                        primAdd =
                          case t of
                            TypeHalf   -> compute >= compute70
                            TypeFloat  -> True
                            TypeDouble -> compute >= compute60
                --
                case s of
                  NumSingleType (IntegralNumType t) -> rmw_integral t addr (op t r)
                  NumSingleType (FloatingNumType t) -> rmw_floating t addr (op t r)
          --
          _ -> internalError "unexpected transition"

    return_
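
-- For update operations without native hardware support (the gaps and 'x'
-- entries in the table above), 'atomicCAS_rmw' and 'atomicCAS_cmp' fall back
-- to an atomic compare-and-swap loop. As an informal sketch only, the scheme
-- corresponds to the following CUDA-style pseudocode for a 64-bit
-- floating-point minimum (the function name is illustrative and not part of
-- this module):
--
-- > double atomic_min_f64(double *addr, double val)
-- > {
-- >     unsigned long long *p   = (unsigned long long *) addr;
-- >     unsigned long long  old = *p, assumed;
-- >     do {
-- >         assumed  = old;
-- >         double x = fmin( __longlong_as_double(assumed), val );
-- >         old      = atomicCAS( p, assumed, __double_as_longlong(x) );
-- >     } while ( assumed != old );
-- >     return __longlong_as_double(old);
-- > }
--
-- The loop retries whenever another thread modified the slot between the read
-- and the swap. The update may therefore be evaluated more than once, which
-- is harmless here because the embedded functions are pure.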

-- Parallel forward permutation function which uses a spinlock to acquire
-- a mutex before updating the value at that location.
--
mkPermute_mutex
    :: Gamma aenv
    -> ArrayR (Array sh e)
    -> ShapeR sh'
    -> IRFun2 PTX aenv (e -> e -> e)
    -> IRFun1 PTX aenv (sh -> PrimMaybe sh')
    -> MIRDelayed PTX aenv (Array sh e)
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh' e))
mkPermute_mutex aenv (ArrayR shr tp) shr' combine project marr =
  let
      outR                  = ArrayR shr' tp
      lockR                 = ArrayR (ShapeRsnoc ShapeRz) (TupRsingle scalarTypeWord32)
      (arrOut,  paramOut)   = mutableArray outR  "out"
      (arrLock, paramLock)  = mutableArray lockR "lock"
      (arrIn,   paramIn)    = delayedArray "in" marr
      paramEnv              = envParam aenv
      start                 = liftInt 0
  in
  makeOpenAcc "permute_mutex" (paramOut ++ paramLock ++ paramIn ++ paramEnv) $ do

    shIn <- delayedExtent arrIn
    end  <- shapeSize shr shIn

    imapFromTo start end $ \i -> do

      ix  <- indexOfInt shr shIn i
      ix' <- app1 project ix

      -- project element onto the destination array and (atomically) update
      when (isJust ix') $ do
        j <- intOfIndex shr' (irArrayShape arrOut) =<< fromJust ix'
        x <- app1 (delayedLinearIndex arrIn) i

        atomically arrLock j $ do
          y <- readArray TypeInt arrOut j
          r <- app2 combine x y
          writeArray TypeInt arrOut j r

    return_


-- Atomically execute the critical section only when the lock at the given
-- array index is obtained.
--
atomically :: IRArray (Vector Word32) -> Operands Int -> CodeGen PTX a -> CodeGen PTX a
atomically barriers i action = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  if computeCapability dev >= Compute 7 0
    then atomically_thread barriers i action
    else atomically_warp   barriers i action


-- Atomically execute the critical section only when the lock at the given
-- array index is obtained. The thread spins waiting for the lock to be
-- released, with exponential backoff on failure in case the lock is
-- contended.
--
-- > uint32_t ns = 8;
-- > while ( atomic_exchange(&lock[i], 1) == 1 ) {
-- >     __nanosleep(ns);
-- >     if ( ns < 256 ) {
-- >         ns *= 2;
-- >     }
-- > }
--
-- Requires the independent thread scheduling features of SM7+.
--
atomically_thread :: IRArray (Vector Word32) -> Operands Int -> CodeGen PTX a -> CodeGen PTX a
atomically_thread barriers i action = do
  let
      lock    = integral integralType 1
      unlock  = integral integralType 0
      unlock' = ir TypeWord32 unlock
      i32     = TupRsingle scalarTypeInt32
  --
  entry <- newBlock "spinlock.entry"
  sleep <- newBlock "spinlock.backoff"
  moar  <- newBlock "spinlock.backoff-moar"
  start <- newBlock "spinlock.critical-start"
  end   <- newBlock "spinlock.critical-end"
  exit  <- newBlock "spinlock.exit"

  ns    <- fresh i32
  addr  <- instr' $ GetElementPtr (asPtr defaultAddrSpace (op integralType (irArrayData barriers))) [op integralType i]
  top   <- br entry

  -- Loop until this thread has completed its critical section. If the slot
  -- was unlocked we just acquired the lock and the thread can perform its
  -- critical section, otherwise sleep the thread and try again later.
  setBlock entry
  old   <- instr $ AtomicRMW numType NonVolatile Exchange addr lock (CrossThread, Acquire)
  ok    <- A.eq singleType old unlock'
  _     <- cbr ok start sleep

  -- We did not acquire the lock. Sleep the thread for a small amount of
  -- time and (possibly) increase the sleep duration for the next round.
  setBlock sleep
  _     <- nanosleep ns
  p     <- A.lt singleType ns (ir TypeInt32 (integral integralType 256))
  _     <- cbr p moar entry

  setBlock moar
  ns'   <- A.mul numType ns (ir TypeInt32 (integral integralType 2))
  _     <- phi' i32 entry ns [(ir TypeInt32 (integral integralType 8), top), (ns, sleep), (ns', moar)]
  _     <- br entry

  -- If we just acquired the lock, execute the critical section, then
  -- release the lock and continue with your day.
  setBlock start
  r     <- action
  _     <- br end

  setBlock end
  _     <- instr $ AtomicRMW numType NonVolatile Exchange addr unlock (CrossThread, AcquireRelease)
  _     <- __threadfence_grid  -- TODO: why is this required?
  _     <- br exit

  setBlock exit
  return r


-- Atomically execute the critical section only when the lock at the given
-- array index is obtained. The thread spins waiting for the lock to be
-- released, and there is no backoff strategy in case the lock is contended.
--
-- The canonical implementation of a spin-lock looks like this:
--
-- > do {
-- >     old = atomic_exchange(&lock[i], 1);
-- > } while (old == 1);
-- >
-- > /* critical section */
-- >
-- > atomic_exchange(&lock[i], 0);
--
-- The initial loop repeatedly attempts to take the lock by writing a 1
-- (locked) into the lock slot. Once the 'old' state of the lock returns 0
-- (unlocked), we have just acquired the lock and the atomic section can be
-- computed. Finally, the lock is released by writing 0 back to the lock slot.
--
-- However, there is a complication with CUDA devices, because all threads in
-- a warp must execute in lockstep (with predicated execution). In the above
-- setup, once a thread acquires a lock it is disabled and stops participating
-- in the loop, waiting for all other threads (to acquire their locks) before
-- continuing program execution. If two threads in the same warp attempt to
-- acquire the same lock, the thread that acquires it sits idle while the
-- other thread spins attempting to grab a lock that will never be released,
-- because the first thread (which holds the lock) can not make progress.
-- DEADLOCK.
--
-- To prevent this situation we must invert the algorithm so that threads can
-- always make progress, until each thread in the warp has committed its
-- result.
--
-- > done = 0;
-- > do {
-- >     if ( atomic_exchange(&lock[i], 1) == 0 ) {
-- >
-- >         /* critical section */
-- >
-- >         done = 1;
-- >         atomic_exchange(&lock[i], 0);
-- >     }
-- > } while ( done == 0 );
--
atomically_warp :: IRArray (Vector Word32) -> Operands Int -> CodeGen PTX a -> CodeGen PTX a
atomically_warp barriers i action = do
  let
      lock    = integral integralType 1
      unlock  = integral integralType 0
      unlock' = ir TypeWord32 unlock
  --
  entry <- newBlock "spinlock.entry"
  start <- newBlock "spinlock.critical-start"
  end   <- newBlock "spinlock.critical-end"
  exit  <- newBlock "spinlock.exit"

  addr <- instr' $ GetElementPtr (asPtr defaultAddrSpace (op integralType (irArrayData barriers))) [op integralType i]
  _    <- br entry

  -- Loop until this thread has completed its critical section. If the slot
  -- was unlocked then we just acquired the lock and the thread can perform
  -- the critical section, otherwise skip to the bottom of the critical
  -- section.
  setBlock entry
  old  <- instr $ AtomicRMW numType NonVolatile Exchange addr lock (CrossThread, Acquire)
  ok   <- A.eq singleType old unlock'
  no   <- cbr ok start end

  -- If we just acquired the lock, execute the critical section
  setBlock start
  r    <- action
  _    <- instr $ AtomicRMW numType NonVolatile Exchange addr unlock (CrossThread, AcquireRelease)
  yes  <- br end

  -- At the base of the critical section, threads participate in a memory
  -- fence to ensure the lock state is committed to memory. The incoming edge
  -- along which a thread arrived at this block determines whether it has
  -- completed its critical section.
  setBlock end
  res  <- freshName
  done <- phi1 end res [(boolean True, yes), (boolean False, no)]

  __syncthreads
  _    <- cbr (OP_Bool done) exit entry

  setBlock exit
  return r