{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module Data.Array.Accelerate.LLVM.PTX.Analysis.Launch (
DeviceProperties, Occupancy, LaunchConfig,
simpleLaunchConfig, launchConfig,
multipleOf, multipleOfQ,
) where
import Foreign.CUDA.Analysis as CUDA
import Language.Haskell.TH
type LaunchConfig
= Int
-> Int
-> Int
-> ( Occupancy
, Int
, Int -> Int
, Int
, Q (TExp (Int -> Int))
)
simpleLaunchConfig :: DeviceProperties -> LaunchConfig
simpleLaunchConfig dev = launchConfig dev (decWarp dev) (const 0) multipleOf multipleOfQ
launchConfig
:: DeviceProperties
-> [Int]
-> (Int -> Int)
-> (Int -> Int -> Int)
-> Q (TExp (Int -> Int -> Int))
-> LaunchConfig
launchConfig dev candidates dynamic_smem grid_size grid_sizeQ maxThreads registers static_smem =
let
(cta, occ) = optimalBlockSizeOf dev (filter (<= maxThreads) candidates) (const registers) smem
maxGrid = multiProcessorCount dev * activeThreadBlocks occ
grid n = maxGrid `min` grid_size n cta
smem n = static_smem + dynamic_smem n
gridQ = [|| \n -> (maxGrid::Int) `min` $$grid_sizeQ (n::Int) (cta::Int) ||]
in
( occ, cta, grid, dynamic_smem cta, gridQ )
multipleOf :: Int -> Int -> Int
multipleOf x y = ((x + y - 1) `quot` y)
multipleOfQ :: Q (TExp (Int -> Int -> Int))
multipleOfQ = [|| multipleOf ||]