{-# LANGUAGE GADTs #-} -- | -- Module : Data.Array.Accelerate.CUDA.Analysis.Launch -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Manuel M T Chakravarty -- Stability : experimental -- Portability : non-partable (GHC extensions) -- module Data.Array.Accelerate.CUDA.Analysis.Launch (launchConfig) where import Control.Monad.IO.Class import Data.Array.Accelerate.AST import Data.Array.Accelerate.Analysis.Type import Data.Array.Accelerate.CUDA.State import qualified Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA -- | -- Determine kernel launch parameters for the given array computation (as well -- as compiled function module). This consists of the thread block size, number -- of blocks, and dynamically allocated shared memory (bytes), respectively. -- -- By default, this launches the kernel with the minimum block size that gives -- maximum occupancy, and the grid size limited to the maximum number of -- physically resident blocks. Hence, kernels may need to process multiple -- elements per thread. -- -- TLM: this could probably be stored in the KernelEntry -- launchConfig :: OpenAcc aenv a -> Int -> CUDA.Fun -> CIO (Int, Int, Integer) launchConfig acc n fn = do regs <- liftIO $ CUDA.requires fn CUDA.NumRegs stat <- liftIO $ CUDA.requires fn CUDA.SharedSizeBytes -- static memory only prop <- getM deviceProps let dyn = sharedMem acc (cta, occ) = CUDA.optimalBlockSize prop (const regs) ((stat+) . dyn) mbk = CUDA.multiProcessorCount prop * CUDA.activeThreadBlocks occ return (cta, mbk `min` gridSize acc n cta, toInteger (dyn cta)) -- | -- Determine the number of blocks of the given size necessary to process the -- given array expression. This should understand things like #elements per -- thread for the various kernels. -- gridSize :: OpenAcc aenv a -> Int -> Int -> Int gridSize acc size cta = let between arr n = (n+arr-1) `div` n in 1 `max` ((cta - 1 + (size `between` elementsPerThread acc)) `div` cta) elementsPerThread :: OpenAcc aenv a -> Int elementsPerThread _ = 1 -- | -- Analyse the given array expression, returning an estimate of dynamic shared -- memory usage as a function of thread block size. This can be used by the -- occupancy calculator to optimise kernel launch shape. -- sharedMem :: OpenAcc aenv a -> Int -> Int sharedMem (Fold _ x _) t = sizeOf (expType x) * t sharedMem (Scanl _ x _) t = sizeOf (expType x) * t sharedMem (Scanr _ x _) t = sizeOf (expType x) * t sharedMem _ _ = 0