{-# LANGUAGE TypeFamilies #-}

-- | A representation with flat parallelism via GPU-oriented kernels.
module Futhark.IR.GPU
  ( GPU,

    -- * Module re-exports
    module Futhark.IR.Prop,
    module Futhark.IR.Traversals,
    module Futhark.IR.Pretty,
    module Futhark.IR.Syntax,
    module Futhark.IR.GPU.Op,
    module Futhark.IR.GPU.Sizes,
    module Futhark.IR.SOACS.SOAC,
  )
where

import Futhark.Builder
import Futhark.Construct
import Futhark.IR.Aliases (Aliases)
import Futhark.IR.GPU.Op
import Futhark.IR.GPU.Sizes
import Futhark.IR.Pretty
import Futhark.IR.Prop
import Futhark.IR.SOACS.SOAC hiding (HistOp (..))
import Futhark.IR.Syntax
import Futhark.IR.Traversals
import Futhark.IR.TypeCheck qualified as TC

-- | The phantom data type for the kernels representation.  It has no
-- constructors; it is used purely as a type-level tag that selects
-- the class instances defined for it in this module.
data GPU

-- | The operation constructor of the 'GPU' representation is 'HostOp'
-- applied to 'SOAC'.  No other associated type is overridden here.
instance RepTypes GPU where
  type OpC GPU = HostOp SOAC

-- | Expression types are computed from the pattern alone via
-- 'expExtTypesFromPat' and returned with 'pure'.
instance ASTRep GPU where
  expTypesFromPat pat = pure (expExtTypesFromPat pat)

-- | Type checking of 'GPU' operations is delegated to
-- 'typeCheckHostOp', with 'typeCheckSOAC' passed as its third
-- argument (presumably the checker for the 'SOAC' payload of a
-- 'HostOp' - confirm against the definition of 'typeCheckHostOp').
instance TC.Checkable GPU where
  -- The check is entered with 'Nothing' as the level.
  checkOp = typeCheckGPUOp Nothing
    where
      -- GHC 9.2 goes into an infinite loop without the type annotation.
      typeCheckGPUOp ::
        Maybe SegLevel ->
        HostOp SOAC (Aliases GPU) ->
        TC.TypeM GPU ()
      -- The first argument given to 'typeCheckHostOp' is a callback
      -- that re-enters this same function, wrapping the 'SegLevel' it
      -- receives in 'Just'.
      typeCheckGPUOp lvl =
        typeCheckHostOp (typeCheckGPUOp . Just) lvl typeCheckSOAC

-- | Construction of 'GPU' terms: the body and expression decorations
-- are both unit, patterns come from 'basicPat', and let-bindings from
-- names are built by 'simpleMkLetNames'.
instance Buildable GPU where
  mkBody stms res = Body () stms res
  mkExpPat idents _exp = basicPat idents
  mkExpDec _pat _exp = ()
  mkLetNames names e = simpleMkLetNames names e

-- No methods are overridden; this instance relies entirely on
-- whatever defaults the 'BuilderOps' class provides.
instance BuilderOps GPU

-- No methods are overridden; this instance relies entirely on
-- whatever defaults the 'PrettyRep' class provides.
instance PrettyRep GPU

-- | In the 'GPU' representation a segmented operation carries a
-- 'SegLevel' and is embedded in an operation via the 'SegOp'
-- constructor; any other operation yields 'Nothing' from 'asSegOp'.
instance HasSegOp GPU where
  type SegOpLevel GPU = SegLevel
  asSegOp hostOp =
    case hostOp of
      SegOp kernel -> Just kernel
      _ -> Nothing
  segOp kernel = SegOp kernel

-- Note [GPU Terminology]
--
-- For lack of a better spot to put it, this Note summarises the
-- terminology used for GPU concepts in the Futhark compiler. The
-- terminology is based on CUDA terminology, and tries to match it as
-- closely as possible. However, this was not always the case (issue
-- #2062), so you may find some code that uses e.g. OpenCL
-- terminology. In most cases there is no ambiguity, but there are a
-- few instances where the same term is used for different things.
-- Please fix any instances you find.
--
-- The terminology is as follows:
--
-- Host: Essentially the CPU; whatever is controlling the GPU.
--
-- Kernel: A GPU program that can be launched from the host.
--
-- Grid: The geometry of the thread blocks launched for a kernel. The
-- size of a grid is always in terms of the number of thread blocks
-- ("grid size"). A grid can have up to 3 dimensions, although we do
-- not make much use of it - and not at all prior to code generation.
--
-- Thread block: Just as in CUDA. "Workgroup" in OpenCL. Abbreviation:
-- tblock. Never just call this "block"; there are too many things
-- called "block". Must match the dimensionality of the grid.
--
-- Thread: Just as in CUDA.  "Workitem" in OpenCL.
--
-- Global thread identifier: A globally unique number for a thread
-- along one dimension. Abbreviation: gtid. We also use this term for
-- the identifiers bound by SegOps. In OpenCL, corresponds to
-- get_global_id(). (Except when we virtualise the thread space.)
--
-- Local thread identifier: A locally unique number (within the thread
-- block) for each thread. Abbreviation: ltid. In OpenCL, corresponds
-- to get_local_id().  In CUDA, corresponds to threadIdx.
--
-- Thread block identifier: A number unique to each thread block in a
-- single dimension.  In CUDA, corresponds to blockIdx.
--
-- Local memory: Thread-local private memory. In CUDA, this is
-- sometimes put in registers (if you are very careful in how you use
-- it). In OpenCL, this is called "private memory", and "local memory"
-- is something else entirely.
--
-- Shared memory: Just as in CUDA. Fast scratchpad memory accessible
-- to all threads within the same thread block. In OpenCL, this is
-- "local memory".
--
-- Device memory: Sometimes also called "global memory"; this is the
-- big-but-slow memory on the GPU.