{-# LANGUAGE TypeFamilies #-}

-- | A representation with flat parallelism via GPU-oriented kernels.
module Futhark.IR.GPU
  ( GPU,

    -- * Module re-exports
    module Futhark.IR.Prop,
    module Futhark.IR.Traversals,
    module Futhark.IR.Pretty,
    module Futhark.IR.Syntax,
    module Futhark.IR.GPU.Op,
    module Futhark.IR.GPU.Sizes,
    module Futhark.IR.SOACS.SOAC,
  )
where

import Futhark.Builder
import Futhark.Construct
import Futhark.IR.Aliases (Aliases)
import Futhark.IR.GPU.Op
import Futhark.IR.GPU.Sizes
import Futhark.IR.Pretty
import Futhark.IR.Prop
import Futhark.IR.SOACS.SOAC hiding (HistOp (..))
import Futhark.IR.Syntax
import Futhark.IR.Traversals
import Futhark.IR.TypeCheck qualified as TC

-- | The phantom data type for the kernels representation.  It has no
-- constructors and exists only at the type level, selecting the GPU
-- instances of the various rep classes defined below ('RepTypes',
-- 'ASTRep', 'TC.Checkable', 'Buildable', 'HasSegOp', etc.).
data GPU

-- Operations in the GPU representation are host-level operations
-- ('HostOp') whose nested operations are 'SOAC's.
instance RepTypes GPU where
  type OpC GPU = HostOp SOAC

instance ASTRep GPU where
  -- The types of a branch are derived directly from the pattern, as
  -- GPU expressions carry no extra type information in their
  -- decorations ('ExpDec GPU' is '()').
  expTypesFromPat = pure . expExtTypesFromPat

instance TC.Checkable GPU where
  -- Checking an operation starts outside of any SegOp, hence the
  -- initial 'Nothing' level.
  checkOp = typeCheckGPUOp Nothing
    where
      -- GHC 9.2 goes into an infinite loop without the type annotation.
      typeCheckGPUOp ::
        Maybe SegLevel ->
        HostOp SOAC (Aliases GPU) ->
        TC.TypeM GPU ()
      typeCheckGPUOp lvl =
        -- Nested host operations are checked recursively, wrapping the
        -- enclosing level in 'Just'; nested SOACs are checked with
        -- 'typeCheckSOAC'.
        typeCheckHostOp (typeCheckGPUOp . Just) lvl typeCheckSOAC

instance Buildable GPU where
  -- Bodies carry a unit decoration.
  mkBody = Body ()

  -- The pattern is determined entirely by the bound identifiers; the
  -- expression contributes nothing.
  mkExpPat idents _ = basicPat idents

  -- Expressions carry a unit decoration.
  mkExpDec _ _ = ()

  mkLetNames = simpleMkLetNames

-- An empty instance: the default method implementations suffice.
instance BuilderOps GPU

-- An empty instance: the default method implementations suffice.
instance PrettyRep GPU

-- GPU 'SegOp's are parameterised by a 'SegLevel'.
instance HasSegOp GPU where
  type SegOpLevel GPU = SegLevel

  -- Project out the SegOp, if the host operation is one.
  asSegOp (SegOp op) = Just op
  asSegOp _ = Nothing

  -- Inject a SegOp into the host operation type.
  segOp = SegOp

-- Note [GPU Terminology]
--
-- For lack of a better spot to put it, this Note summarises the
-- terminology used for GPU concepts in the Futhark compiler. The
-- terminology is based on CUDA terminology, and tries to match it as
-- closely as possible. However, this was not always the case (issue
-- #2062), so you may find some code that uses e.g. OpenCL
-- terminology. In most cases there is no ambiguity, but there are a
-- few instances where the same term is used for different things.
-- Please fix any instances you find.
--
-- The terminology is as follows:
--
-- Host: Essentially the CPU; whatever is controlling the GPU.
--
-- Kernel: A GPU program that can be launched from the host.
--
-- Grid: The geometry of the thread blocks launched for a kernel. The
-- size of a grid is always in terms of the number of thread blocks
-- ("grid size"). A grid can have up to 3 dimensions, although we do
-- not make much use of it - and not at all prior to code generation.
--
-- Thread block: Just as in CUDA. "Workgroup" in OpenCL. Abbreviation:
-- tblock. Never just call this "block"; there are too many things
-- called "block". Must match the dimensionality of the grid.
--
-- Thread: Just as in CUDA.  "Workitem" in OpenCL.
--
-- Global thread identifier: A globally unique number for a thread
-- along one dimension. Abbreviation: gtid. We also use this term for
-- the identifiers bound by SegOps. In OpenCL, corresponds to
-- get_global_id(). (Except when we virtualise the thread space.)
--
-- Local thread identifier: A locally unique number (within the thread
-- block) for each thread. Abbreviation: ltid. In OpenCL, corresponds
-- to get_local_id().  In CUDA, corresponds to threadIdx.
--
-- Thread block identifier: A number unique to each thread block in a
-- single dimension.  In CUDA, corresponds to blockIdx.
--
-- Local memory: Thread-local private memory. In CUDA, this is
-- sometimes put in registers (if you are very careful in how you use
-- it). In OpenCL, this is called "private memory", and "local memory"
-- is something else entirely.
--
-- Shared memory: Just as in CUDA. Fast scratchpad memory accessible
-- to all threads within the same thread block. In OpenCL, this is
-- "local memory".
--
-- Device memory: Sometimes also called "global memory"; this is the
-- big-but-slow memory on the GPU.