-- GENERATED by C->Haskell Compiler, version 0.28.2 Switcheroo, 1 April 2016 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "src/Foreign/CUDA/Driver/Exec.chs" #-}
{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE EmptyDataDecls           #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs                    #-}
{-# LANGUAGE TemplateHaskell          #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Driver.Exec
-- Copyright : [2009..2017] Trevor L. McDonell
-- License   : BSD
--
-- Kernel execution control for low-level driver interface
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Driver.Exec (

  -- * Kernel Execution
  Fun(Fun), FunParam(..), FunAttribute(..), SharedMem(..),
  requires,
  setCacheConfigFun,
  setSharedMemConfigFun,
  launchKernel, launchKernel',

  -- Deprecated since CUDA-4.0
  setBlockShape, setSharedSize, setParams, launch,

) where
import qualified Foreign.C.Types as C2HSImp
import qualified Foreign.Ptr as C2HSImp





{-# LINE 32 "src/Foreign/CUDA/Driver/Exec.chs" #-}


-- Friends
import Foreign.CUDA.Internal.C2HS
import Foreign.CUDA.Driver.Error
import Foreign.CUDA.Driver.Context                      ( Cache(..), SharedMem(..) )
import Foreign.CUDA.Driver.Stream                       ( Stream(..), defaultStream )

-- System
import Foreign
import Foreign.C
import Data.Maybe
import Control.Monad                                    ( zipWithM_ )


{-# DEPRECATED setBlockShape, setSharedSize, setParams, launch
      "use launchKernel instead" #-}


--------------------------------------------------------------------------------
-- Data Types
--------------------------------------------------------------------------------

-- |
-- A @\_\_global\_\_@ device function
--
newtype Fun = Fun { useFun :: ((C2HSImp.Ptr ()))}


-- |
-- Function attributes
--
data FunAttribute = MaxKernelThreadsPerBlock
                  | SharedSizeBytes
                  | ConstSizeBytes
                  | LocalSizeBytes
                  | NumRegs
                  | PtxVersion
                  | BinaryVersion
                  | CacheModeCa
                  | CU_FUNC_ATTRIBUTE_MAX
  deriving (Eq,Show)
instance Enum FunAttribute where
  succ MaxKernelThreadsPerBlock = SharedSizeBytes
  succ SharedSizeBytes = ConstSizeBytes
  succ ConstSizeBytes = LocalSizeBytes
  succ LocalSizeBytes = NumRegs
  succ NumRegs = PtxVersion
  succ PtxVersion = BinaryVersion
  succ BinaryVersion = CacheModeCa
  succ CacheModeCa = CU_FUNC_ATTRIBUTE_MAX
  succ CU_FUNC_ATTRIBUTE_MAX = error "FunAttribute.succ: CU_FUNC_ATTRIBUTE_MAX has no successor"

  pred SharedSizeBytes = MaxKernelThreadsPerBlock
  pred ConstSizeBytes = SharedSizeBytes
  pred LocalSizeBytes = ConstSizeBytes
  pred NumRegs = LocalSizeBytes
  pred PtxVersion = NumRegs
  pred BinaryVersion = PtxVersion
  pred CacheModeCa = BinaryVersion
  pred CU_FUNC_ATTRIBUTE_MAX = CacheModeCa
  pred MaxKernelThreadsPerBlock = error "FunAttribute.pred: MaxKernelThreadsPerBlock has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from CU_FUNC_ATTRIBUTE_MAX

  fromEnum MaxKernelThreadsPerBlock = 0
  fromEnum SharedSizeBytes = 1
  fromEnum ConstSizeBytes = 2
  fromEnum LocalSizeBytes = 3
  fromEnum NumRegs = 4
  fromEnum PtxVersion = 5
  fromEnum BinaryVersion = 6
  fromEnum CacheModeCa = 7
  fromEnum CU_FUNC_ATTRIBUTE_MAX = 8

  toEnum 0 = MaxKernelThreadsPerBlock
  toEnum 1 = SharedSizeBytes
  toEnum 2 = ConstSizeBytes
  toEnum 3 = LocalSizeBytes
  toEnum 4 = NumRegs
  toEnum 5 = PtxVersion
  toEnum 6 = BinaryVersion
  toEnum 7 = CacheModeCa
  toEnum 8 = CU_FUNC_ATTRIBUTE_MAX
  toEnum unmatched = error ("FunAttribute.toEnum: Cannot match " ++ show unmatched)

{-# LINE 70 "src/Foreign/CUDA/Driver/Exec.chs" #-}


-- |
-- Kernel function parameters
--
data FunParam where
  IArg :: !Int32           -> FunParam
  FArg :: !Float           -> FunParam
  VArg :: Storable a => !a -> FunParam

instance Storable FunParam where
  sizeOf (IArg _)       = sizeOf (undefined :: CUInt)
  sizeOf (FArg _)       = sizeOf (undefined :: CFloat)
  sizeOf (VArg v)       = sizeOf v

  alignment (IArg _)    = alignment (undefined :: CUInt)
  alignment (FArg _)    = alignment (undefined :: CFloat)
  alignment (VArg v)    = alignment v

  poke p (IArg i)       = poke (castPtr p) i
  poke p (FArg f)       = poke (castPtr p) f
  poke p (VArg v)       = poke (castPtr p) v

  peek _                = error "Can not peek Foreign.CUDA.Driver.FunParam"


--------------------------------------------------------------------------------
-- Execution Control
--------------------------------------------------------------------------------

-- |
-- Returns the value of the selected attribute requirement for the given kernel.
--
-- <http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b>
--
{-# INLINEABLE requires #-}
requires :: Fun -> FunAttribute -> IO Int
requires !fn !att = resultIfOk =<< cuFuncGetAttribute att fn

{-# INLINE cuFuncGetAttribute #-}
cuFuncGetAttribute :: (FunAttribute) -> (Fun) -> IO ((Status), (Int))
cuFuncGetAttribute a2 a3 =
  alloca $ \a1' -> 
  let {a2' = cFromEnum a2} in 
  let {a3' = useFun a3} in 
  cuFuncGetAttribute'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 113 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the given device
-- function. This is only a preference; the driver is free to choose a different
-- configuration as required to execute the function.
--
-- Switching between configuration modes may insert a device-side
-- synchronisation point for streamed kernel launches.
--
-- Requires CUDA-3.0.
--
-- <http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681>
--
{-# INLINEABLE setCacheConfigFun #-}
setCacheConfigFun :: Fun -> Cache -> IO ()
setCacheConfigFun !fn !pref = nothingIfOk =<< cuFuncSetCacheConfig fn pref

{-# INLINE cuFuncSetCacheConfig #-}
cuFuncSetCacheConfig :: (Fun) -> (Cache) -> IO ((Status))
cuFuncSetCacheConfig a1 a2 =
  let {a1' = useFun a1} in 
  let {a2' = cFromEnum a2} in 
  cuFuncSetCacheConfig'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 140 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- Set the shared memory configuration of a device function.
--
-- On devices with configurable shared memory banks, this will force all
-- subsequent launches of the given device function to use the specified
-- shared memory bank size configuration. On launch of the function, the
-- shared memory configuration of the device will be temporarily changed if
-- needed to suit the function configuration. Changes in shared memory
-- configuration may introduction a device side synchronisation between
-- kernel launches.
--
-- Any per-function configuration specified by 'setSharedMemConfig' will
-- override the context-wide configuration set with
-- 'Foreign.CUDA.Driver.Context.Config.setSharedMem'.
--
-- Changing the shared memory bank size will not increase shared memory
-- usage or affect occupancy of kernels, but may have major effects on
-- performance. Larger bank sizes will allow for greater potential
-- bandwidth to shared memory, but will change what kinds of accesses to
-- shared memory will result in bank conflicts.
--
-- This function will do nothing on devices with fixed shared memory bank
-- size.
--
-- Requires CUDA-5.0.
--
-- <http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g430b913f24970e63869635395df6d9f5>
--
{-# INLINEABLE setSharedMemConfigFun #-}
setSharedMemConfigFun :: Fun -> SharedMem -> IO ()
setSharedMemConfigFun !fun !pref = nothingIfOk =<< cuFuncSetSharedMemConfig fun pref

{-# INLINE cuFuncSetSharedMemConfig #-}
cuFuncSetSharedMemConfig :: (Fun) -> (SharedMem) -> IO ((Status))
cuFuncSetSharedMemConfig a1 a2 =
  let {a1' = useFun a1} in 
  let {a2' = cFromEnum a2} in 
  cuFuncSetSharedMemConfig'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 184 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- Invoke a kernel on a @(gx * gy * gz)@ grid of blocks, where each block
-- contains @(tx * ty * tz)@ threads and has access to a given number of bytes
-- of shared memory. The launch may also be associated with a specific 'Stream'.
--
-- In 'launchKernel', the number of kernel parameters and their offsets and
-- sizes do not need to be specified, as this information is retrieved directly
-- from the kernel's image. This requires the kernel to have been compiled with
-- toolchain version 3.2 or later.
--
-- The alternative 'launchKernel'' will pass the arguments in directly,
-- requiring the application to know the size and alignment/padding of each
-- kernel parameter.
--
-- <http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15>
--
{-# INLINEABLE launchKernel  #-}
{-# INLINEABLE launchKernel' #-}
launchKernel, launchKernel'
    :: Fun                      -- ^ function to execute
    -> (Int,Int,Int)            -- ^ block grid dimension
    -> (Int,Int,Int)            -- ^ thread block shape
    -> Int                      -- ^ shared memory (bytes)
    -> Maybe Stream             -- ^ (optional) stream to execute in
    -> [FunParam]               -- ^ list of function parameters
    -> IO ()
launchKernel !fn (!gx,!gy,!gz) (!tx,!ty,!tz) !sm !mst !args
  = (=<<) nothingIfOk
  $ withMany withFP args
  $ \pa -> withArray pa
  $ \pp -> cuLaunchKernel fn gx gy gz tx ty tz sm st pp nullPtr
  where
    !st = fromMaybe defaultStream mst

    withFP :: FunParam -> (Ptr FunParam -> IO b) -> IO b
    withFP !p !f = case p of
      IArg v -> with' v (f . castPtr)
      FArg v -> with' v (f . castPtr)
      VArg v -> with' v (f . castPtr)

    -- can't use the standard 'with' because 'alloca' will pass an undefined
    -- dummy argument when determining 'sizeOf' and 'alignment', but sometimes
    -- instances in Accelerate need to evaluate this argument.
    --
    with' :: Storable a => a -> (Ptr a -> IO b) -> IO b
    with' !val !f =
      allocaBytes (sizeOf val) $ \ptr -> do
        poke ptr val
        f ptr


launchKernel' !fn (!gx,!gy,!gz) (!tx,!ty,!tz) !sm !mst !args
  = (=<<) nothingIfOk
  $ with bytes
  $ \pb -> withArray' args
  $ \pa -> withArray0 nullPtr [buffer, castPtr pa, size, castPtr pb]
  $ \pp -> cuLaunchKernel fn gx gy gz tx ty tz sm st nullPtr pp
  where
    buffer      = wordPtrToPtr 0x01     -- CU_LAUNCH_PARAM_BUFFER_POINTER
    size        = wordPtrToPtr 0x02     -- CU_LAUNCH_PARAM_BUFFER_SIZE
    bytes       = foldl (\a x -> a + sizeOf x) 0 args
    st          = fromMaybe defaultStream mst

    -- can't use the standard 'withArray' because 'mallocArray' will pass
    -- 'undefined' to 'sizeOf' when determining how many bytes to allocate, but
    -- our Storable instance for FunParam needs to dispatch on each constructor,
    -- hence evaluating the undefined.
    --
    withArray' !vals !f =
      allocaBytes bytes $ \ptr -> do
        pokeArray ptr vals
        f ptr


{-# INLINE cuLaunchKernel #-}
cuLaunchKernel :: (Fun) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Stream) -> (Ptr (Ptr FunParam)) -> (Ptr (Ptr ())) -> IO ((Status))
cuLaunchKernel a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = fromIntegral a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = useStream a9} in 
  let {a10' = castPtr a10} in 
  let {a11' = castPtr a11} in 
  cuLaunchKernel'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' a11' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 270 "src/Foreign/CUDA/Driver/Exec.chs" #-}




--------------------------------------------------------------------------------
-- Deprecated
--------------------------------------------------------------------------------

-- |
-- Invoke the kernel on a size @(w,h)@ grid of blocks. Each block contains the
-- number of threads specified by a previous call to 'setBlockShape'. The launch
-- may also be associated with a specific 'Stream'.
--
{-# INLINEABLE launch #-}
launch :: Fun -> (Int,Int) -> Maybe Stream -> IO ()
launch !fn (!w,!h) mst =
  nothingIfOk =<< cuLaunchGridAsync fn w h (fromMaybe defaultStream mst)

{-# INLINE cuLaunchGridAsync #-}
cuLaunchGridAsync :: (Fun) -> (Int) -> (Int) -> (Stream) -> IO ((Status))
cuLaunchGridAsync a1 a2 a3 a4 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuLaunchGridAsync'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 302 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- Specify the @(x,y,z)@ dimensions of the thread blocks that are created when
-- the given kernel function is launched.
--
{-# INLINEABLE setBlockShape #-}
setBlockShape :: Fun -> (Int,Int,Int) -> IO ()
setBlockShape !fn (!x,!y,!z) = nothingIfOk =<< cuFuncSetBlockShape fn x y z

{-# INLINE cuFuncSetBlockShape #-}
cuFuncSetBlockShape :: (Fun) -> (Int) -> (Int) -> (Int) -> IO ((Status))
cuFuncSetBlockShape a1 a2 a3 a4 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  cuFuncSetBlockShape'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 318 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- Set the number of bytes of dynamic shared memory to be available to each
-- thread block when the function is launched
--
{-# INLINEABLE setSharedSize #-}
setSharedSize :: Fun -> Integer -> IO ()
setSharedSize !fn !bytes = nothingIfOk =<< cuFuncSetSharedSize fn bytes

{-# INLINE cuFuncSetSharedSize #-}
cuFuncSetSharedSize :: (Fun) -> (Integer) -> IO ((Status))
cuFuncSetSharedSize a1 a2 =
  let {a1' = useFun a1} in 
  let {a2' = cIntConv a2} in 
  cuFuncSetSharedSize'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 332 "src/Foreign/CUDA/Driver/Exec.chs" #-}



-- |
-- Set the parameters that will specified next time the kernel is invoked
--
{-# INLINEABLE setParams #-}
setParams :: Fun -> [FunParam] -> IO ()
setParams !fn !prs = do
  zipWithM_ (set fn) offsets prs
  nothingIfOk =<< cuParamSetSize fn (last offsets)
  where
    offsets = scanl (\a b -> a + size b) 0 prs

    size (IArg _)    = sizeOf (undefined :: CUInt)
    size (FArg _)    = sizeOf (undefined :: CFloat)
    size (VArg v)    = sizeOf v

    set f o (IArg v) = nothingIfOk =<< cuParamSeti f o v
    set f o (FArg v) = nothingIfOk =<< cuParamSetf f o v
    set f o (VArg v) = with v $ \p -> (nothingIfOk =<< cuParamSetv f o p (sizeOf v))


{-# INLINE cuParamSetSize #-}
cuParamSetSize :: (Fun) -> (Int) -> IO ((Status))
cuParamSetSize a1 a2 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  cuParamSetSize'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 358 "src/Foreign/CUDA/Driver/Exec.chs" #-}


{-# INLINE cuParamSeti #-}
cuParamSeti :: (Fun) -> (Int) -> (Int32) -> IO ((Status))
cuParamSeti a1 a2 a3 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  cuParamSeti'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 364 "src/Foreign/CUDA/Driver/Exec.chs" #-}


{-# INLINE cuParamSetf #-}
cuParamSetf :: (Fun) -> (Int) -> (Float) -> IO ((Status))
cuParamSetf a1 a2 a3 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = realToFrac a3} in 
  cuParamSetf'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 370 "src/Foreign/CUDA/Driver/Exec.chs" #-}


{-# INLINE cuParamSetv #-}
cuParamSetv :: Storable a => (Fun) -> (Int) -> (Ptr a) -> (Int) -> IO ((Status))
cuParamSetv a1 a2 a3 a4 =
  let {a1' = useFun a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = castPtr a3} in 
  let {a4' = fromIntegral a4} in 
  cuParamSetv'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 378 "src/Foreign/CUDA/Driver/Exec.chs" #-}




foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuFuncGetAttribute"
  cuFuncGetAttribute'_ :: ((C2HSImp.Ptr C2HSImp.CInt) -> (C2HSImp.CInt -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuFuncSetCacheConfig"
  cuFuncSetCacheConfig'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuFuncSetSharedMemConfig"
  cuFuncSetSharedMemConfig'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuLaunchKernel"
  cuLaunchKernel'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> ((C2HSImp.Ptr ()) -> ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> (IO C2HSImp.CInt))))))))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuLaunchGridAsync"
  cuLaunchGridAsync'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (C2HSImp.CInt -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuFuncSetBlockShape"
  cuFuncSetBlockShape'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (C2HSImp.CInt -> (C2HSImp.CInt -> (IO C2HSImp.CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuFuncSetSharedSize"
  cuFuncSetSharedSize'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CUInt -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuParamSetSize"
  cuParamSetSize'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CUInt -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuParamSeti"
  cuParamSeti'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (C2HSImp.CUInt -> (IO C2HSImp.CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuParamSetf"
  cuParamSetf'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (C2HSImp.CFloat -> (IO C2HSImp.CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Exec.chs.h cuParamSetv"
  cuParamSetv'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> ((C2HSImp.Ptr ()) -> (C2HSImp.CUInt -> (IO C2HSImp.CInt)))))