{-# LANGUAGE BangPatterns #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Analysis.Occupancy
-- Copyright : [2009..2017] Trevor L. McDonell
-- License   : BSD
--
-- Occupancy calculations for CUDA kernels
--
-- <http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls>
--
-- /Determining Registers Per Thread and Shared Memory Per Block/
--
-- To determine the number of registers used per thread in your kernel, simply
-- compile the kernel code using the option
--
-- > --ptxas-options=-v
--
-- to nvcc. This will output information about register, local memory, shared
-- memory, and constant memory usage for each kernel in the @.cu@ file.
-- Alternatively, you can compile with the @-cubin@ option to nvcc. This will
-- generate a @.cubin@ file, which you can open in a text editor. Look for the
-- @code@ section with your kernel's name. Within the curly braces (@{ ... }@)
-- for that code block, you will see a line with @reg = X@, where @X@ is the
-- number of registers used by your kernel. You can also see the amount of
-- shared memory used as @smem = Y@. However, if your kernel declares any
-- external shared memory that is allocated dynamically, you will need to add
-- the number in the @.cubin@ file to the amount you dynamically allocate at run
-- time to get the correct shared memory usage.
--
-- /Notes About Occupancy/
--
-- Higher occupancy does not necessarily mean higher performance. If a kernel
-- is not bandwidth bound, then increasing occupancy will not necessarily
-- increase performance. If a kernel invocation is already running at least one
-- thread block per multiprocessor in the GPU, and it is bottlenecked by
-- computation and not by global memory accesses, then increasing occupancy may
-- have no effect. In fact, making changes just to increase occupancy can have
-- other effects, such as additional instructions, spills to local memory (which
-- is off chip), divergent branches, etc. As with any optimization, you should
-- experiment to see how changes affect the /wall clock time/ of the kernel
-- execution. For bandwidth bound applications, on the other hand, increasing
-- occupancy can help better hide the latency of memory accesses, and therefore
-- improve performance.
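--
-- As a sketch of how the figures reported by @ptxas@ feed into this module
-- (the numbers below are illustrative, not measured): for a kernel using 32
-- registers per thread and 2048 bytes of statically allocated shared memory,
-- launched with 256 threads per block, the per-multiprocessor occupancy is
-- given by
--
-- > occupancy dev 256 32 2048
--
-- where @dev@ is the 'DeviceProperties' record of the card in question.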
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Analysis.Occupancy (

  Occupancy(..),
  occupancy,
  optimalBlockSize, optimalBlockSizeOf,
  maxResidentBlocks,
  incPow2, incWarp, decPow2, decWarp

) where

import Data.Ord
import Data.List

import Foreign.CUDA.Analysis.Device


-- GPU Occupancy per multiprocessor
--
data Occupancy = Occupancy
  {
    activeThreads      :: !Int,         -- ^ Active threads per multiprocessor
    activeThreadBlocks :: !Int,         -- ^ Active thread blocks per multiprocessor
    activeWarps        :: !Int,         -- ^ Active warps per multiprocessor
    occupancy100       :: !Double       -- ^ Occupancy of each multiprocessor (percent)
  }
  deriving (Eq, Ord, Show)


-- |
-- Calculate occupancy data for a given GPU and kernel resource usage
--
{-# INLINEABLE occupancy #-}
occupancy
    :: DeviceProperties -- ^ Properties of the card in question
    -> Int              -- ^ Threads per block
    -> Int              -- ^ Registers per thread
    -> Int              -- ^ Shared memory per block (bytes)
    -> Occupancy
occupancy !dev !threads !regs !smem
  = Occupancy at ab aw oc
  where
    at = ab * threads
    aw = ab * limitWarpsPerBlock
    ab = minimum [limitDueToWarps, limitDueToRegs, limitDueToSMem]
    oc = 100 * fromIntegral aw / fromIntegral (warpsPerMP gpu)

    ceiling'      = ceiling :: Double -> Int
    floor'        = floor   :: Double -> Int
    ceilingBy x s = s * ceiling' (fromIntegral x / fromIntegral s)
    floorBy   x s = s * floor'   (fromIntegral x / fromIntegral s)

    -- Physical resources
    --
    gpu = deviceResources dev

    -- Allocated resource limits
    --
    limitWarpsPerBlock  = ceiling' (fromIntegral threads / fromIntegral (threadsPerWarp gpu))
    -- limitWarpsPerSM  = warpsPerMP gpu

    limitRegsPerBlock   = case regAllocationStyle gpu of
                            Block -> ((limitWarpsPerBlock `ceilingBy` warpAllocUnit gpu) * regs * threadsPerWarp gpu) `ceilingBy` regAllocUnit gpu
                            Warp  -> limitWarpsPerBlock

    limitRegsPerSM      = case regAllocationStyle gpu of
                            Block -> maxRegPerBlock gpu
                            Warp  -> (maxRegPerBlock gpu `div` ((regs * threadsPerWarp gpu) `ceilingBy` warpRegAllocUnit gpu)) `floorBy` warpAllocUnit gpu

    limitSMemPerBlock   = smem `ceilingBy` sharedMemAllocUnit gpu
    -- limitSMemPerSM   = maxSharedMemPerBlock gpu

    -- Maximum thread blocks per multiprocessor
    --
    limitDueToWarps     = threadBlocksPerMP gpu `min` (warpsPerMP gpu `div` limitWarpsPerBlock)

    limitDueToRegs
      | regs > maxRegPerThread gpu  = 0
      | regs > 0                    = (limitRegsPerSM `div` limitRegsPerBlock) * (regFileSizePerMP gpu `div` maxRegPerBlock gpu)
      | otherwise                   = threadBlocksPerMP gpu

    limitDueToSMem
      | smem > maxSharedMemPerBlock gpu = 0
      | smem > 0                        = sharedMemPerMP gpu `div` limitSMemPerBlock
      | otherwise                       = threadBlocksPerMP gpu


-- |
-- Optimise multiprocessor occupancy as a function of thread block size and
-- resource usage. This returns the smallest satisfying block size in increments
-- of a single warp.
--
{-# INLINEABLE optimalBlockSize #-}
optimalBlockSize
    :: DeviceProperties         -- ^ Architecture to optimise for
    -> (Int -> Int)             -- ^ Register count as a function of thread block size
    -> (Int -> Int)             -- ^ Shared memory usage (bytes) as a function of thread block size
    -> (Int, Occupancy)
optimalBlockSize dev = optimalBlockSizeOf dev (decWarp dev)


-- |
-- As 'optimalBlockSize', but with a generator that produces the specific thread
-- block sizes that should be tested. The generated list can produce values in
-- any order, but the last satisfying block size will be returned. Hence, values
-- should be monotonically decreasing to return the smallest block size yielding
-- maximum occupancy, and vice-versa.
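--
-- A minimal sketch (the register and shared memory figures are illustrative
-- assumptions): searching candidate block sizes in single-warp decrements, for
-- a kernel whose resource usage does not depend on the block size,
--
-- > optimalBlockSizeOf dev (decWarp dev) (const 32) (const 2048)
--
-- returns the smallest block size that achieves maximum occupancy.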
--
{-# INLINEABLE optimalBlockSizeOf #-}
optimalBlockSizeOf
    :: DeviceProperties         -- ^ Architecture to optimise for
    -> [Int]                    -- ^ Thread block sizes to consider
    -> (Int -> Int)             -- ^ Register count as a function of thread block size
    -> (Int -> Int)             -- ^ Shared memory usage (bytes) as a function of thread block size
    -> (Int, Occupancy)
optimalBlockSizeOf !dev !threads !freg !fsmem
  = maximumBy (comparing (occupancy100 . snd))
  $ [ (t, occupancy dev t (freg t) (fsmem t)) | t <- threads ]


-- | Increments in powers-of-two, over the range of supported thread block sizes
-- for the given device.
--
{-# INLINEABLE incPow2 #-}
incPow2 :: DeviceProperties -> [Int]
incPow2 !dev = map ((2::Int)^) [lb, lb+1 .. ub]
  where
    round' = round :: Double -> Int
    lb     = round' . logBase 2 . fromIntegral $ warpSize dev
    ub     = round' . logBase 2 . fromIntegral $ maxThreadsPerBlock dev

-- | Decrements in powers-of-two, over the range of supported thread block sizes
-- for the given device.
--
{-# INLINEABLE decPow2 #-}
decPow2 :: DeviceProperties -> [Int]
decPow2 !dev = map ((2::Int)^) [ub, ub-1 .. lb]
  where
    round' = round :: Double -> Int
    lb     = round' . logBase 2 . fromIntegral $ warpSize dev
    ub     = round' . logBase 2 . fromIntegral $ maxThreadsPerBlock dev

-- | Decrements in the warp size of the device, over the range of supported
-- thread block sizes.
--
{-# INLINEABLE decWarp #-}
decWarp :: DeviceProperties -> [Int]
decWarp !dev = [block, block-warp .. warp]
  where
    !warp  = warpSize dev
    !block = maxThreadsPerBlock dev

-- | Increments in the warp size of the device, over the range of supported
-- thread block sizes.
--
{-# INLINEABLE incWarp #-}
incWarp :: DeviceProperties -> [Int]
incWarp !dev = [warp, 2*warp .. block]
  where
    warp  = warpSize dev
    block = maxThreadsPerBlock dev


-- |
-- Determine the maximum number of CTAs that can be run simultaneously for a
-- given kernel / device combination.
--
{-# INLINEABLE maxResidentBlocks #-}
maxResidentBlocks
    :: DeviceProperties   -- ^ Properties of the card in question
    -> Int                -- ^ Threads per block
    -> Int                -- ^ Registers per thread
    -> Int                -- ^ Shared memory per block (bytes)
    -> Int                -- ^ Maximum number of resident blocks
maxResidentBlocks !dev !thds !regs !smem =
  multiProcessorCount dev * activeThreadBlocks (occupancy dev thds regs smem)
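
-- End-to-end sketch (not part of the API; the helper name @example@ and the
-- resource figures of 32 registers and zero static shared memory are
-- illustrative assumptions): pick the best block size for a kernel, then
-- compute how many blocks of that size can be resident on the whole device.
--
-- > example :: DeviceProperties -> (Int, Int)
-- > example dev =
-- >   let (blockSize, _occ) = optimalBlockSize dev (const 32) (const 0)
-- >       resident          = maxResidentBlocks dev blockSize 32 0
-- >   in  (blockSize, resident)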