{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE UndecidableInstances #-}

module Futhark.IR.GPU.Op
  ( -- * Size operations
    SizeOp (..),

    -- * Host operations
    HostOp (..),
    traverseHostOpStms,
    typeCheckHostOp,

    -- * SegOp refinements
    SegLevel (..),
    segVirt,
    SegVirt (..),
    SegSeqDims (..),
    KernelGrid (..),

    -- * Reexports
    module Futhark.IR.GPU.Sizes,
    module Futhark.IR.SegOp,
  )
where

import Control.Monad
import Data.Sequence qualified as SQ
import Data.Text qualified as T
import Futhark.Analysis.Alias qualified as Alias
import Futhark.Analysis.Metrics
import Futhark.Analysis.SymbolTable qualified as ST
import Futhark.IR
import Futhark.IR.Aliases (Aliases, CanBeAliased (..))
import Futhark.IR.GPU.Sizes
import Futhark.IR.Mem (OpReturns (..), extReturns)
import Futhark.IR.Prop.Aliases
import Futhark.IR.SegOp
import Futhark.IR.TypeCheck qualified as TC
import Futhark.Optimise.Simplify.Engine qualified as Engine
import Futhark.Optimise.Simplify.Rep
import Futhark.Transform.Rename
import Futhark.Transform.Substitute
import Futhark.Util.Pretty
  ( commasep,
    parens,
    ppTuple',
    pretty,
    (<+>),
  )
import Futhark.Util.Pretty qualified as PP

-- | These dimensions (indexed from 0, outermost) of the corresponding
-- 'SegSpace' should not be parallelised, but instead iterated
-- sequentially. For example, with a 'SegSeqDims' of @[0]@ and a
-- 'SegSpace' with dimensions @[n][m]@, there will be an outer loop
-- with @n@ iterations, while the @m@ dimension will be parallelised.
--
-- Semantically, this has no effect, but it may allow reductions in
-- memory usage or other low-level optimisations. Operationally, the
-- guarantee is that for a 'SegSeqDims' of e.g. @[i,j,k]@, threads
-- running at any given moment will always have the same indexes along
-- the dimensions specified by @[i,j,k]@.
--
-- At the moment, this is only supported for 'SegNoVirtFull'
-- intra-block parallelism in GPU code, as we have not yet found it
-- useful anywhere else.
newtype SegSeqDims = SegSeqDims {segSeqDims :: [Int]}
  deriving (Eq, Ord, Show)

-- | Do we need block-virtualisation when generating code for the
-- segmented operation? In most cases, we do, but for some simple
-- kernels, we compute the full number of blocks in advance, and then
-- virtualisation is an unnecessary (but generally very small)
-- overhead. This only really matters for fairly trivial but very
-- wide @map@ kernels where each thread performs constant-time work on
-- scalars.
data SegVirt
  = SegVirt
  | SegNoVirt
  | -- | Not only do we not need virtualisation, but we _guarantee_
    -- that all physical threads participate in the work. This can
    -- save some checks in code generation.
    SegNoVirtFull SegSeqDims
  deriving (Eq, Ord, Show)

-- | The actual, physical grid dimensions used for the GPU kernel
-- running this 'SegOp'.
data KernelGrid = KernelGrid
  { gridNumBlocks :: Count NumBlocks SubExp,
    gridBlockSize :: Count BlockSize SubExp
  }
  deriving (Eq, Ord, Show)

-- | At which level the *body* of a t'SegOp' executes.
data SegLevel
  = SegThread SegVirt (Maybe KernelGrid)
  | SegBlock SegVirt (Maybe KernelGrid)
  | SegThreadInBlock SegVirt
  deriving (Eq, Ord, Show)
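-- As a hedged illustration of how these types compose (@num_blocks@
-- and @block_size@ stand for arbitrary in-scope 'SubExp's and are not
-- defined here):
--
-- @
-- -- Intra-block parallelism with the outermost dimension sequentialised:
-- SegThreadInBlock (SegNoVirtFull (SegSeqDims [0]))
--
-- -- A virtualised thread-level SegOp with an explicit grid:
-- SegThread SegVirt (Just (KernelGrid (Count num_blocks) (Count block_size)))
-- @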
-- | The 'SegVirt' of the 'SegLevel'.
segVirt :: SegLevel -> SegVirt
segVirt (SegThread v _) = v
segVirt (SegBlock v _) = v
segVirt (SegThreadInBlock v) = v

instance PP.Pretty SegVirt where
  pretty SegNoVirt = mempty
  pretty (SegNoVirtFull dims) = "full" <+> pretty (segSeqDims dims)
  pretty SegVirt = "virtualise"

instance PP.Pretty KernelGrid where
  pretty (KernelGrid num_tblocks tblock_size) =
    "grid=" <> pretty num_tblocks <> PP.semi
      <+> "blocksize=" <> pretty tblock_size

instance PP.Pretty SegLevel where
  pretty (SegThread virt grid) =
    PP.parens ("thread" <> PP.semi <+> pretty virt <> PP.semi <+> pretty grid)
  pretty (SegBlock virt grid) =
    PP.parens ("block" <> PP.semi <+> pretty virt <> PP.semi <+> pretty grid)
  pretty (SegThreadInBlock virt) =
    PP.parens ("inblock" <> PP.semi <+> pretty virt)

instance Engine.Simplifiable KernelGrid where
  simplify (KernelGrid num_tblocks tblock_size) =
    KernelGrid
      <$> traverse Engine.simplify num_tblocks
      <*> traverse Engine.simplify tblock_size

instance Engine.Simplifiable SegLevel where
  simplify (SegThread virt grid) =
    SegThread virt <$> Engine.simplify grid
  simplify (SegBlock virt grid) =
    SegBlock virt <$> Engine.simplify grid
  simplify (SegThreadInBlock virt) =
    pure $ SegThreadInBlock virt

instance Substitute KernelGrid where
  substituteNames substs (KernelGrid num_tblocks tblock_size) =
    KernelGrid
      (substituteNames substs num_tblocks)
      (substituteNames substs tblock_size)

instance Substitute SegLevel where
  substituteNames substs (SegThread virt grid) =
    SegThread virt (substituteNames substs grid)
  substituteNames substs (SegBlock virt grid) =
    SegBlock virt (substituteNames substs grid)
  substituteNames _ (SegThreadInBlock virt) =
    SegThreadInBlock virt

instance Rename SegLevel where
  rename = substituteRename

instance FreeIn KernelGrid where
  freeIn' (KernelGrid num_tblocks tblock_size) =
    freeIn' (num_tblocks, tblock_size)

instance FreeIn SegLevel where
  freeIn' (SegThread _virt grid) = freeIn' grid
  freeIn' (SegBlock _virt grid) = freeIn' grid
  freeIn' (SegThreadInBlock _virt) = mempty
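-- For reference, the instances above render a level in the textual IR
-- roughly like this (a sketch; exact spacing is up to the
-- pretty-printer, and @num_blocks@ and @block_size@ are placeholder
-- names):
--
-- @
-- (thread; virtualise; grid=num_blocks; blocksize=block_size)
-- (inblock; full [0])
-- @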
-- | A simple size-level query or computation.
data SizeOp
  = -- | Produce some runtime-configurable size.
    GetSize Name SizeClass
  | -- | The maximum size of some class.
    GetSizeMax SizeClass
  | -- | Compare size (likely a threshold) with some integer value.
    CmpSizeLe Name SizeClass SubExp
  | -- | @CalcNumBlocks w max_num_tblocks tblock_size@ calculates the
    -- number of GPU threadblocks to use for an input of the given
    -- size. The @Name@ is a size name. Note that @w@ is an i64 to
    -- avoid overflow issues.
    CalcNumBlocks SubExp Name SubExp
  deriving (Eq, Ord, Show)

instance Substitute SizeOp where
  substituteNames substs (CmpSizeLe name sclass x) =
    CmpSizeLe name sclass (substituteNames substs x)
  substituteNames substs (CalcNumBlocks w max_num_tblocks tblock_size) =
    CalcNumBlocks
      (substituteNames substs w)
      max_num_tblocks
      (substituteNames substs tblock_size)
  substituteNames _ op = op

instance Rename SizeOp where
  rename (CmpSizeLe name sclass x) =
    CmpSizeLe name sclass <$> rename x
  rename (CalcNumBlocks w max_num_tblocks tblock_size) =
    CalcNumBlocks <$> rename w <*> pure max_num_tblocks <*> rename tblock_size
  rename x = pure x

instance FreeIn SizeOp where
  freeIn' (CmpSizeLe _ _ x) = freeIn' x
  freeIn' (CalcNumBlocks w _ tblock_size) = freeIn' w <> freeIn' tblock_size
  freeIn' _ = mempty

instance PP.Pretty SizeOp where
  pretty (GetSize name size_class) =
    "get_size" <> parens (commasep [pretty name, pretty size_class])
  pretty (GetSizeMax size_class) =
    "get_size_max" <> parens (commasep [pretty size_class])
  pretty (CmpSizeLe name size_class x) =
    "cmp_size" <> parens (commasep [pretty name, pretty size_class])
      <+> "<="
      <+> pretty x
  pretty (CalcNumBlocks w max_num_tblocks tblock_size) =
    "calc_num_tblocks"
      <> parens (commasep [pretty w, pretty max_num_tblocks, pretty tblock_size])

instance OpMetrics SizeOp where
  opMetrics GetSize {} = seen "GetSize"
  opMetrics GetSizeMax {} = seen "GetSizeMax"
  opMetrics CmpSizeLe {} = seen "CmpSizeLe"
  opMetrics CalcNumBlocks {} = seen "CalcNumBlocks"

typeCheckSizeOp :: (TC.Checkable rep) => SizeOp -> TC.TypeM rep ()
typeCheckSizeOp GetSize {} = pure ()
typeCheckSizeOp GetSizeMax {} = pure ()
typeCheckSizeOp (CmpSizeLe _ _ x) = TC.require [Prim int64] x
typeCheckSizeOp (CalcNumBlocks w _ tblock_size) = do
  TC.require [Prim int64] w
  TC.require [Prim int64] tblock_size

-- | A host-level operation; parameterised by what else it can do.
data HostOp op rep
  = -- | A segmented operation.
    SegOp (SegOp SegLevel rep)
  | SizeOp SizeOp
  | OtherOp (op rep)
  | -- | Code to run sequentially on the GPU, in a single thread.
    GPUBody [Type] (Body rep)
  deriving (Eq, Ord, Show)

-- | A helper for defining 'TraverseOpStms'.
traverseHostOpStms ::
  (Monad m) =>
  OpStmsTraverser m (op rep) rep ->
  OpStmsTraverser m (HostOp op rep) rep
traverseHostOpStms _ f (SegOp segop) = SegOp <$> traverseSegOpStms f segop
traverseHostOpStms _ _ (SizeOp sizeop) = pure $ SizeOp sizeop
traverseHostOpStms onOtherOp f (OtherOp other) = OtherOp <$> onOtherOp f other
traverseHostOpStms _ f (GPUBody ts body) = do
  stms <- f mempty $ bodyStms body
  pure $ GPUBody ts $ body {bodyStms = stms}

instance (ASTRep rep, Substitute (op rep)) => Substitute (HostOp op rep) where
  substituteNames substs (SegOp op) =
    SegOp $ substituteNames substs op
  substituteNames substs (OtherOp op) =
    OtherOp $ substituteNames substs op
  substituteNames substs (SizeOp op) =
    SizeOp $ substituteNames substs op
  substituteNames substs (GPUBody ts body) =
    GPUBody (substituteNames substs ts) (substituteNames substs body)

instance (ASTRep rep, Rename (op rep)) => Rename (HostOp op rep) where
  rename (SegOp op) = SegOp <$> rename op
  rename (OtherOp op) = OtherOp <$> rename op
  rename (SizeOp op) = SizeOp <$> rename op
  rename (GPUBody ts body) = GPUBody <$> rename ts <*> rename body
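-- A hedged sketch of how a rep whose 'OtherOp' carries no statements
-- of its own (such as 'NoOp') might reuse 'traverseHostOpStms' for
-- its 'TraverseOpStms' instance; the instance head is illustrative:
--
-- @
-- instance TraverseOpStms SomeGPURep where
--   traverseOpStms = traverseHostOpStms (const pure)
-- @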
instance (IsOp op) => IsOp (HostOp op) where
  safeOp (SegOp op) = safeOp op
  safeOp (OtherOp op) = safeOp op
  safeOp (SizeOp _) = True
  safeOp (GPUBody _ body) = all (safeExp . stmExp) $ bodyStms body

  cheapOp (SegOp op) = cheapOp op
  cheapOp (OtherOp op) = cheapOp op
  cheapOp (SizeOp _) = True
  cheapOp (GPUBody types body) =
    -- Current GPUBody usage only benefits from hoisting kernels that
    -- transfer scalars to device.
    SQ.null (bodyStms body) && all ((== 0) . arrayRank) types

  opDependencies (SegOp op) = opDependencies op
  opDependencies (OtherOp op) = opDependencies op
  opDependencies (SizeOp op) = [freeIn op]
  opDependencies (GPUBody _ body) =
    replicate (length . bodyResult $ body) (freeIn body)

instance (TypedOp op) => TypedOp (HostOp op) where
  opType (SegOp op) = opType op
  opType (OtherOp op) = opType op
  opType (SizeOp (GetSize _ _)) = pure [Prim int64]
  opType (SizeOp (GetSizeMax _)) = pure [Prim int64]
  opType (SizeOp CmpSizeLe {}) = pure [Prim Bool]
  opType (SizeOp CalcNumBlocks {}) = pure [Prim int64]
  opType (GPUBody ts _) =
    pure $ staticShapes $ map (`arrayOfRow` intConst Int64 1) ts

instance (AliasedOp op) => AliasedOp (HostOp op) where
  opAliases (SegOp op) = opAliases op
  opAliases (OtherOp op) = opAliases op
  opAliases (SizeOp _) = [mempty]
  opAliases (GPUBody ts _) = map (const mempty) ts

  consumedInOp (SegOp op) = consumedInOp op
  consumedInOp (OtherOp op) = consumedInOp op
  consumedInOp (SizeOp _) = mempty
  consumedInOp (GPUBody _ body) = consumedInBody body

instance (ASTRep rep, FreeIn (op rep)) => FreeIn (HostOp op rep) where
  freeIn' (SegOp op) = freeIn' op
  freeIn' (OtherOp op) = freeIn' op
  freeIn' (SizeOp op) = freeIn' op
  freeIn' (GPUBody ts body) = freeIn' ts <> freeIn' body

instance (CanBeAliased op) => CanBeAliased (HostOp op) where
  addOpAliases aliases (SegOp op) = SegOp $ addOpAliases aliases op
  addOpAliases aliases (GPUBody ts body) =
    GPUBody ts $ Alias.analyseBody aliases body
  addOpAliases aliases (OtherOp op) = OtherOp $ addOpAliases aliases op
  addOpAliases _ (SizeOp op) = SizeOp op

instance (CanBeWise op) => CanBeWise (HostOp op) where
  addOpWisdom (SegOp op) = SegOp $ addOpWisdom op
  addOpWisdom (OtherOp op) = OtherOp $ addOpWisdom op
  addOpWisdom (SizeOp op) = SizeOp op
  addOpWisdom (GPUBody ts body) = GPUBody ts $ informBody body

instance OpReturns (HostOp NoOp) where
  opReturns (SegOp op) = segOpReturns op
  opReturns k = extReturns <$> opType k

instance (ASTRep rep, ST.IndexOp (op rep)) => ST.IndexOp (HostOp op rep) where
  indexOp vtable k (SegOp op) is = ST.indexOp vtable k op is
  indexOp vtable k (OtherOp op) is = ST.indexOp vtable k op is
  indexOp _ _ _ _ = Nothing

instance (PrettyRep rep, PP.Pretty (op rep)) => PP.Pretty (HostOp op rep) where
  pretty (SegOp op) = pretty op
  pretty (OtherOp op) = pretty op
  pretty (SizeOp op) = pretty op
  pretty (GPUBody ts body) =
    "gpu"
      <+> PP.colon
      <+> ppTuple' (map pretty ts)
      <+> PP.nestedBlock "{" "}" (pretty body)

instance (OpMetrics (Op rep), OpMetrics (op rep)) => OpMetrics (HostOp op rep) where
  opMetrics (SegOp op) = opMetrics op
  opMetrics (OtherOp op) = opMetrics op
  opMetrics (SizeOp op) = opMetrics op
  opMetrics (GPUBody _ body) = inside "GPUBody" $ bodyMetrics body

instance (RephraseOp op) => RephraseOp (HostOp op) where
  rephraseInOp r (SegOp op) = SegOp <$> rephraseInOp r op
  rephraseInOp r (OtherOp op) = OtherOp <$> rephraseInOp r op
  rephraseInOp _ (SizeOp op) = pure $ SizeOp op
  rephraseInOp r (GPUBody ts body) = GPUBody ts <$> rephraseBody r body

checkGrid :: (TC.Checkable rep) => KernelGrid -> TC.TypeM rep ()
checkGrid grid = do
  TC.require [Prim int64] $ unCount $ gridNumBlocks grid
  TC.require [Prim int64] $ unCount $ gridBlockSize grid
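-- 'checkSegLevel' below enforces the legal nestings of SegOp levels.
-- As a rough summary (a paraphrase of the clauses, not a separate
-- specification):
--
-- @
-- SegThread        -- top level; only a grid-less SegThread may be nested
-- SegBlock         -- top level only
-- SegThreadInBlock -- only directly inside a block-level SegOp
-- @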
checkSegLevel ::
  (TC.Checkable rep) =>
  Maybe SegLevel ->
  SegLevel ->
  TC.TypeM rep ()
checkSegLevel (Just SegBlock {}) (SegThreadInBlock _virt) =
  pure ()
checkSegLevel _ (SegThreadInBlock _virt) =
  TC.bad $ TC.TypeError "inblock SegOp not in block SegOp."
checkSegLevel (Just SegThread {}) _ =
  TC.bad $ TC.TypeError "SegOps cannot occur when already at thread level."
checkSegLevel (Just SegThreadInBlock {}) _ =
  TC.bad $ TC.TypeError "SegOps cannot occur when already at inblock level."
checkSegLevel _ (SegThread _virt Nothing) =
  pure ()
checkSegLevel (Just _) SegThread {} =
  TC.bad $ TC.TypeError "thread-level SegOp cannot be nested"
checkSegLevel Nothing (SegThread _virt grid) =
  mapM_ checkGrid grid
checkSegLevel (Just _) SegBlock {} =
  TC.bad $ TC.TypeError "block-level SegOp cannot be nested"
checkSegLevel Nothing (SegBlock _virt grid) =
  mapM_ checkGrid grid

typeCheckHostOp ::
  (TC.Checkable rep) =>
  (SegLevel -> Op (Aliases rep) -> TC.TypeM rep ()) ->
  Maybe SegLevel ->
  (op (Aliases rep) -> TC.TypeM rep ()) ->
  HostOp op (Aliases rep) ->
  TC.TypeM rep ()
typeCheckHostOp checker lvl _ (SegOp op) =
  TC.checkOpWith (checker $ segLevel op) $
    typeCheckSegOp (checkSegLevel lvl) op
typeCheckHostOp _ Just {} _ GPUBody {} =
  TC.bad $ TC.TypeError "GPUBody may not be nested in SegOps."
typeCheckHostOp _ _ f (OtherOp op) = f op
typeCheckHostOp _ _ _ (SizeOp op) = typeCheckSizeOp op
typeCheckHostOp _ Nothing _ (GPUBody ts body) = do
  mapM_ TC.checkType ts
  void $ TC.checkBody body
  body_ts <-
    extendedScope
      (traverse subExpResType (bodyResult body))
      (scopeOf (bodyStms body))
  unless (body_ts == ts) . TC.bad . TC.TypeError . T.unlines $
    [ "Expected type: " <> prettyTuple ts,
      "Got body type: " <> prettyTuple body_ts
    ]
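-- A hedged sketch of how a concrete rep might wire 'typeCheckHostOp'
-- into its 'TC.Checkable' instance, assuming its 'OtherOp' needs no
-- checking of its own ('SomeGPURep' is a placeholder name):
--
-- @
-- instance TC.Checkable SomeGPURep where
--   checkOp = typeCheckGPUOp Nothing
--     where
--       typeCheckGPUOp lvl =
--         typeCheckHostOp (typeCheckGPUOp . Just) lvl (const $ pure ())
-- @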