{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Array.Accelerate.LLVM.PTX.Link (
module Data.Array.Accelerate.LLVM.Link,
ExecutableR(..), FunctionTable(..), Kernel(..), ObjectCode,
withExecutable,
linkFunctionQ,
) where
import Data.Array.Accelerate.Lifetime
import Data.Array.Accelerate.LLVM.Link
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.Compile
import Data.Array.Accelerate.LLVM.PTX.Context
import Data.Array.Accelerate.LLVM.PTX.Link.Cache
import Data.Array.Accelerate.LLVM.PTX.Link.Object
import Data.Array.Accelerate.LLVM.PTX.Target
import qualified Data.Array.Accelerate.LLVM.PTX.Debug as Debug
import qualified Foreign.CUDA.Analysis as CUDA
import qualified Foreign.CUDA.Driver as CUDA
import Control.Monad.State
import Data.ByteString.Short.Char8 ( ShortByteString, unpack )
import Foreign.Ptr
import Language.Haskell.TH
import Text.Printf ( printf )
import qualified Data.ByteString.Unsafe as B
import Prelude as P hiding ( lookup )
instance Link PTX where
data ExecutableR PTX = PTXR { ptxExecutable :: {-# UNPACK #-} !(Lifetime FunctionTable)
}
linkForTarget = link
link :: ObjectR PTX -> LLVM PTX (ExecutableR PTX)
link (ObjectR uid cfg obj) = do
target <- gets llvmTarget
cache <- gets ptxKernelTable
funs <- liftIO $ dlsym uid cache $ do
jit <- B.unsafeUseAsCString obj $ \p -> CUDA.loadDataFromPtrEx (castPtr p) []
let mdl = CUDA.jitModule jit
nm <- FunctionTable `fmap` mapM (uncurry (linkFunction mdl)) cfg
oc <- newLifetime mdl
addFinalizer oc $ do
Debug.traceIO Debug.dump_gc ("gc: unload module: " ++ show nm)
withContext (ptxContext target) (CUDA.unload mdl)
return (nm, oc)
return $! PTXR funs
linkFunction
:: CUDA.Module
-> ShortByteString
-> LaunchConfig
-> IO Kernel
linkFunction mdl name configure =
fst `fmap` linkFunctionQ mdl name configure
linkFunctionQ
:: CUDA.Module
-> ShortByteString
-> LaunchConfig
-> IO (Kernel, Q (TExp (Int -> Int)))
linkFunctionQ mdl name configure = do
f <- CUDA.getFun mdl (unpack name)
regs <- CUDA.requires f CUDA.NumRegs
ssmem <- CUDA.requires f CUDA.SharedSizeBytes
cmem <- CUDA.requires f CUDA.ConstSizeBytes
lmem <- CUDA.requires f CUDA.LocalSizeBytes
maxt <- CUDA.requires f CUDA.MaxKernelThreadsPerBlock
let
(occ, cta, grid, dsmem, gridQ) = configure maxt regs ssmem
msg1, msg2 :: String
msg1 = printf "kernel function '%s' used %d registers, %d bytes smem, %d bytes lmem, %d bytes cmem"
(unpack name) regs (ssmem + dsmem) lmem cmem
msg2 = printf "multiprocessor occupancy %.1f %% : %d threads over %d warps in %d blocks"
(CUDA.occupancy100 occ)
(CUDA.activeThreads occ)
(CUDA.activeWarps occ)
(CUDA.activeThreadBlocks occ)
Debug.traceIO Debug.dump_cc (printf "cc: %s\n ... %s" msg1 msg2)
return (Kernel name f dsmem cta grid, gridQ)
withExecutable :: ExecutableR PTX -> (FunctionTable -> LLVM PTX b) -> LLVM PTX b
withExecutable PTXR{..} f = do
r <- f (unsafeGetValue ptxExecutable)
liftIO $ touchLifetime ptxExecutable
return r