{-# LANGUAGE MagicHash #-} {-# LANGUAGE RecordWildCards #-} -- | -- Module : Data.Array.Accelerate.LLVM.PTX.Context -- Copyright : [2014..2020] The Accelerate Team -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.LLVM.PTX.Context ( Context(..), new, raw, withContext, ) where import Data.Array.Accelerate.Lifetime import Data.Array.Accelerate.LLVM.PTX.Analysis.Device import qualified Data.Array.Accelerate.LLVM.PTX.Debug as Debug import qualified Foreign.CUDA.Driver.Device as CUDA import qualified Foreign.CUDA.Driver.Context as CUDA import Control.Concurrent import Control.Exception import Control.Monad import Data.Hashable import Text.PrettyPrint import Prelude hiding ( (<>) ) import GHC.Base ( Int(..), addr2Int#, ) import GHC.Ptr ( Ptr(..) ) -- | An execution context, which is tied to a specific device and CUDA execution -- context. -- data Context = Context { deviceProperties :: {-# UNPACK #-} !CUDA.DeviceProperties -- information on hardware resources , deviceContext :: {-# UNPACK #-} !(Lifetime CUDA.Context) -- device execution context } instance Eq Context where c1 == c2 = deviceContext c1 == deviceContext c2 instance Hashable Context where hashWithSalt salt = let ptrToInt :: Ptr a -> Int ptrToInt (Ptr addr#) = I# (addr2Int# addr#) in hashWithSalt salt . ptrToInt . CUDA.useContext . unsafeGetValue . deviceContext -- | Create a new CUDA execution context -- new :: CUDA.Device -> CUDA.DeviceProperties -> [CUDA.ContextFlag] -> IO Context new dev prp flags = do ctx <- raw dev prp =<< CUDA.create dev flags _ <- CUDA.pop return ctx -- | Wrap a raw CUDA execution context -- raw :: CUDA.Device -> CUDA.DeviceProperties -> CUDA.Context -> IO Context raw dev prp ctx = do lft <- newLifetime ctx addFinalizer lft $ do message $ "finalise context " ++ showContext ctx CUDA.destroy ctx -- The kernels don't use much shared memory, so for devices that support it -- prefer using those memory banks as an L1 cache. -- -- TLM: Is this a good idea? For example, external libraries such as cuBLAS -- rely heavily on shared memory and thus this could adversely affect -- performance. Perhaps we should use 'setCacheConfigFun' for individual -- functions which might benefit from this. -- when (CUDA.computeCapability prp >= CUDA.Compute 2 0) (CUDA.setCache CUDA.PreferL1) -- Display information about the selected device Debug.traceIO Debug.dump_phases (deviceInfo dev prp) return $! Context prp lft -- | Push the context onto the CPUs thread stack of current contexts and execute -- some operation. -- {-# INLINE withContext #-} withContext :: Context -> IO a -> IO a withContext Context{..} action = runInBoundThread $ withLifetime deviceContext $ \ctx -> bracket_ (push ctx) pop action {-# INLINE push #-} push :: CUDA.Context -> IO () push ctx = do message $ "push context: " ++ showContext ctx CUDA.push ctx {-# INLINE pop #-} pop :: IO () pop = do ctx <- CUDA.pop message $ "pop context: " ++ showContext ctx -- Debugging -- --------- -- Nicely format a summary of the selected CUDA device, example: -- -- Device 0: GeForce 9600M GT (compute capability 1.1), 4 multiprocessors @ 1.25GHz (32 cores), 512MB global memory -- deviceInfo :: CUDA.Device -> CUDA.DeviceProperties -> String deviceInfo dev prp = render $ devID <> colon <+> name <+> parens compute <> comma <+> processors <+> at <+> text clock <+> parens cores <> comma <+> memory where name = text (CUDA.deviceName prp) compute = text "compute capability" <+> text (show $ CUDA.computeCapability prp) devID = text "device" <+> int (fromIntegral $ CUDA.useDevice dev) processors = int (CUDA.multiProcessorCount prp) <+> text "multiprocessors" cores = int (CUDA.multiProcessorCount prp * coresPerMultiProcessor prp) <+> text "cores" memory = text mem <+> text "global memory" -- clock = Debug.showFFloatSIBase (Just 2) 1000 (fromIntegral $ CUDA.clockRate prp * 1000 :: Double) "Hz" mem = Debug.showFFloatSIBase (Just 0) 1024 (fromIntegral $ CUDA.totalGlobalMem prp :: Double) "B" at = char '@' -- reset = zeroWidthText "\r" {-# INLINE trace #-} trace :: String -> IO a -> IO a trace msg next = do Debug.traceIO Debug.dump_gc ("gc: " ++ msg) next {-# INLINE message #-} message :: String -> IO () message s = s `trace` return () {-# INLINE showContext #-} showContext :: CUDA.Context -> String showContext (CUDA.Context c) = show c