{-# LANGUAGE QuasiQuotes #-}

-- | This module defines a translation from imperative code with
-- kernels to imperative code with OpenCL, CUDA, or HIP calls.
module Futhark.CodeGen.ImpGen.GPU.ToOpenCL
  ( kernelsToOpenCL,
    kernelsToCUDA,
    kernelsToHIP,
  )
where

import Control.Monad
import Control.Monad.Identity
import Control.Monad.Reader
import Control.Monad.State
import Data.Bifunctor (second)
import Data.Foldable (toList)
import Data.Map.Strict qualified as M
import Data.Maybe
import Data.Set qualified as S
import Data.Text qualified as T
import Futhark.CodeGen.Backends.GenericC.Fun qualified as GC
import Futhark.CodeGen.Backends.GenericC.Pretty
import Futhark.CodeGen.Backends.SimpleRep
import Futhark.CodeGen.ImpCode.GPU hiding (Program)
import Futhark.CodeGen.ImpCode.GPU qualified as ImpGPU
import Futhark.CodeGen.ImpCode.OpenCL hiding (Program)
import Futhark.CodeGen.ImpCode.OpenCL qualified as ImpOpenCL
import Futhark.CodeGen.RTS.C (atomicsH, halfH)
import Futhark.CodeGen.RTS.CUDA (preludeCU)
import Futhark.CodeGen.RTS.OpenCL (copyCL, preludeCL, transposeCL)
import Futhark.Error (compilerLimitationS)
import Futhark.MonadFreshNames
import Futhark.Util (mapAccumLM, zEncodeText)
import Futhark.Util.IntegralExp (rem)
import Language.C.Quote.OpenCL qualified as C
import Language.C.Syntax qualified as C
import NeatInterpolation (untrimming)
import Prelude hiding (rem)

-- | Generate HIP host and device code.
kernelsToHIP :: ImpGPU.Program -> ImpOpenCL.Program
kernelsToHIP = translateGPU TargetHIP

-- | Generate CUDA host and device code.
kernelsToCUDA :: ImpGPU.Program -> ImpOpenCL.Program
kernelsToCUDA = translateGPU TargetCUDA

-- | Generate OpenCL host and device code.
kernelsToOpenCL :: ImpGPU.Program -> ImpOpenCL.Program
kernelsToOpenCL = translateGPU TargetOpenCL

-- | Translate a kernels-program to an OpenCL-program.
translateGPU ::
  KernelTarget ->
  ImpGPU.Program ->
  ImpOpenCL.Program
translateGPU target prog =
  let env = envFromProg prog
      ( prog',
        ToOpenCL kernels device_funs used_types sizes failures
        ) =
          (`runState` initialOpenCL) . (`runReaderT` env) $ do
            let ImpGPU.Definitions
                  types
                  (ImpGPU.Constants ps consts)
                  (ImpGPU.Functions funs) = prog
            consts' <- traverse (onHostOp target) consts
            funs' <- forM funs $ \(fname, fun) ->
              (fname,) <$> traverse (onHostOp target) fun

            pure $
              ImpOpenCL.Definitions
                types
                (ImpOpenCL.Constants ps consts')
                (ImpOpenCL.Functions funs')

      (device_prototypes, device_defs) = unzip $ M.elems device_funs
      kernels' = M.map fst kernels
      opencl_code = T.unlines $ map snd $ M.elems kernels

      opencl_prelude =
        T.unlines
          [ genPrelude target used_types,
            definitionsText device_prototypes,
            T.unlines device_defs
          ]
   in ImpOpenCL.Program
        opencl_code
        opencl_prelude
        kernels'
        (S.toList used_types)
        (findParamUsers env prog' (cleanSizes sizes))
        failures
        prog'
  where
    genPrelude TargetOpenCL = genOpenClPrelude
    genPrelude TargetCUDA = const genCUDAPrelude
    genPrelude TargetHIP = const genHIPPrelude

-- | Due to simplifications after kernel extraction, some threshold
-- parameters may contain KernelPaths that reference threshold
-- parameters that no longer exist.  We remove these here.
cleanSizes :: M.Map Name SizeClass -> M.Map Name SizeClass
cleanSizes m = M.map clean m
  where
    known = M.keys m
    clean (SizeThreshold path def) =
      SizeThreshold (filter ((`elem` known) . fst) path) def
    clean s = s
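-- A rough illustration of 'cleanSizes' with hypothetical names: if the
-- threshold parameter @gone@ has been simplified away while @kept@ is still
-- present in the map, a path such as @[("gone", True), ("kept", False)]@ is
-- pruned to @[("kept", False)]@.  Non-threshold size classes are returned
-- unchanged.
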
findParamUsers ::
  Env ->
  Definitions ImpOpenCL.OpenCL ->
  M.Map Name SizeClass ->
  ParamMap
findParamUsers env defs = M.mapWithKey onParam
  where
    cg = envCallGraph env

    getSize (ImpOpenCL.GetSize _ v) = Just v
    getSize (ImpOpenCL.CmpSizeLe _ v _) = Just v
    getSize (ImpOpenCL.GetSizeMax {}) = Nothing
    getSize (ImpOpenCL.LaunchKernel {}) = Nothing
    directUseInFun fun = mapMaybe getSize $ toList $ functionBody fun
    direct_uses = map (second directUseInFun) $ unFunctions $ defFuns defs

    calledBy fname = M.findWithDefault mempty fname cg
    indirectUseInFun fname =
      ( fname,
        foldMap snd $ filter ((`S.member` calledBy fname) . fst) direct_uses
      )
    indirect_uses = direct_uses <> map (indirectUseInFun . fst) direct_uses

    onParam k c =
      (c, S.fromList $ map fst $ filter ((k `elem`) . snd) indirect_uses)

pointerQuals :: String -> [C.TypeQual]
pointerQuals "global" = [C.ctyquals|__global|]
pointerQuals "local" = [C.ctyquals|__local|]
pointerQuals "private" = [C.ctyquals|__private|]
pointerQuals "constant" = [C.ctyquals|__constant|]
pointerQuals "write_only" = [C.ctyquals|__write_only|]
pointerQuals "read_only" = [C.ctyquals|__read_only|]
pointerQuals "kernel" = [C.ctyquals|__kernel|]
-- OpenCL does not actually have a "device" space, but we use it in
-- the compiler pipeline to refer to memory on the device, as opposed
-- to the host.  From a kernel's perspective, this is "global".
pointerQuals "device" = pointerQuals "global"
pointerQuals s = error $ "'" ++ s ++ "' is not an OpenCL kernel address space."

-- In-kernel name and per-workgroup size in bytes.
type LocalMemoryUse = (VName, Count Bytes (TExp Int64))

data KernelState = KernelState
  { kernelLocalMemory :: [LocalMemoryUse],
    kernelFailures :: [FailureMsg],
    kernelNextSync :: Int,
    -- | Has a potential failure occurred since the last
    -- ErrorSync?
    kernelSyncPending :: Bool,
    kernelHasBarriers :: Bool
  }

newKernelState :: [FailureMsg] -> KernelState
newKernelState failures = KernelState mempty failures 0 False False

errorLabel :: KernelState -> String
errorLabel = ("error_" ++) . show . kernelNextSync

data ToOpenCL = ToOpenCL
  { clGPU :: M.Map KernelName (KernelSafety, T.Text),
    clDevFuns :: M.Map Name (C.Definition, T.Text),
    clUsedTypes :: S.Set PrimType,
    clSizes :: M.Map Name SizeClass,
    clFailures :: [FailureMsg]
  }

initialOpenCL :: ToOpenCL
initialOpenCL = ToOpenCL mempty mempty mempty mempty mempty

data Env = Env
  { envFuns :: ImpGPU.Functions ImpGPU.HostOp,
    envFunsMayFail :: S.Set Name,
    envCallGraph :: M.Map Name (S.Set Name)
  }

codeMayFail :: (a -> Bool) -> ImpGPU.Code a -> Bool
codeMayFail _ (Assert {}) = True
codeMayFail f (Op x) = f x
codeMayFail f (x :>>: y) = codeMayFail f x || codeMayFail f y
codeMayFail f (For _ _ x) = codeMayFail f x
codeMayFail f (While _ x) = codeMayFail f x
codeMayFail f (If _ x y) = codeMayFail f x || codeMayFail f y
codeMayFail f (Comment _ x) = codeMayFail f x
codeMayFail _ _ = False

hostOpMayFail :: ImpGPU.HostOp -> Bool
hostOpMayFail (CallKernel k) = codeMayFail kernelOpMayFail $ kernelBody k
hostOpMayFail _ = False

kernelOpMayFail :: ImpGPU.KernelOp -> Bool
kernelOpMayFail = const False

funsMayFail :: M.Map Name (S.Set Name) -> ImpGPU.Functions ImpGPU.HostOp -> S.Set Name
funsMayFail cg (Functions funs) =
  S.fromList $ map fst $ filter mayFail funs
  where
    base_mayfail =
      map fst $ filter (codeMayFail hostOpMayFail . ImpGPU.functionBody . snd) funs
    mayFail (fname, _) =
      any (`elem` base_mayfail) $
        fname : S.toList (M.findWithDefault mempty fname cg)
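-- A small worked example of the analysis above, with hypothetical function
-- names: if @f@'s body contains an 'Assert' (so 'codeMayFail' holds for it)
-- and the call graph maps @g@ to a set containing @f@, then 'funsMayFail'
-- returns a set containing both @f@ and @g@ - a function may fail if it, or
-- anything it (transitively) calls, may fail.
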
envFromProg :: ImpGPU.Program -> Env
envFromProg prog = Env funs (funsMayFail cg funs) cg
  where
    funs = defFuns prog
    cg = ImpGPU.callGraph calledInHostOp funs

lookupFunction :: Name -> Env -> Maybe (ImpGPU.Function HostOp)
lookupFunction fname = lookup fname . unFunctions . envFuns

functionMayFail :: Name -> Env -> Bool
functionMayFail fname = S.member fname . envFunsMayFail

type OnKernelM = ReaderT Env (State ToOpenCL)

addSize :: Name -> SizeClass -> OnKernelM ()
addSize key sclass =
  modify $ \s -> s {clSizes = M.insert key sclass $ clSizes s}

onHostOp :: KernelTarget -> HostOp -> OnKernelM OpenCL
onHostOp target (CallKernel k) = onKernel target k
onHostOp _ (ImpGPU.GetSize v key size_class) = do
  addSize key size_class
  pure $ ImpOpenCL.GetSize v key
onHostOp _ (ImpGPU.CmpSizeLe v key size_class x) = do
  addSize key size_class
  pure $ ImpOpenCL.CmpSizeLe v key x
onHostOp _ (ImpGPU.GetSizeMax v size_class) =
  pure $ ImpOpenCL.GetSizeMax v size_class

genGPUCode ::
  Env ->
  OpsMode ->
  KernelCode ->
  [FailureMsg] ->
  GC.CompilerM KernelOp KernelState a ->
  (a, GC.CompilerState KernelState)
genGPUCode env mode body failures =
  GC.runCompilerM
    (inKernelOperations env mode body)
    blankNameSource
    (newKernelState failures)

-- Compilation of a device function that is not invoked from the
-- host, but is invoked by (perhaps multiple) kernels.
generateDeviceFun :: Name -> ImpGPU.Function ImpGPU.KernelOp -> OnKernelM ()
generateDeviceFun fname device_func = do
  when (any memParam $ functionInput device_func) bad

  env <- ask
  failures <- gets clFailures
  let (func, kstate) =
        if functionMayFail fname env
          then
            let params =
                  [ [C.cparam|__global int *global_failure|],
                    [C.cparam|__global typename int64_t *global_failure_args|]
                  ]
                (f, cstate) =
                  genGPUCode env FunMode (declsFirst $ functionBody device_func) failures $
                    GC.compileFun mempty params (fname, device_func)
             in (f, GC.compUserState cstate)
          else
            let (f, cstate) =
                  genGPUCode env FunMode (declsFirst $ functionBody device_func) failures $
                    GC.compileVoidFun mempty (fname, device_func)
             in (f, GC.compUserState cstate)

  modify $ \s ->
    s
      { clUsedTypes = typesInCode (functionBody device_func) <> clUsedTypes s,
        clDevFuns = M.insert fname (second funcText func) $ clDevFuns s,
        clFailures = kernelFailures kstate
      }

  -- Important to do this after the 'modify' call, so we propagate the
  -- right clFailures.
  void $ ensureDeviceFuns $ functionBody device_func
  where
    memParam MemParam {} = True
    memParam ScalarParam {} = False

    bad = compilerLimitationS "Cannot generate GPU functions that use arrays."

-- Ensure that this device function is available, but don't regenerate
-- it if it already exists.
ensureDeviceFun :: Name -> ImpGPU.Function ImpGPU.KernelOp -> OnKernelM ()
ensureDeviceFun fname host_func = do
  exists <- gets $ M.member fname . clDevFuns
  unless exists $ generateDeviceFun fname host_func

calledInHostOp :: HostOp -> S.Set Name
calledInHostOp (CallKernel k) = calledFuncs calledInKernelOp $ kernelBody k
calledInHostOp _ = mempty

calledInKernelOp :: KernelOp -> S.Set Name
calledInKernelOp = const mempty
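-- For orientation, a sketch of the calling convention produced by
-- 'generateDeviceFun' for a function that may fail (names are illustrative;
-- the exact prototype is determined by 'GC.compileFun'): the two failure
-- parameters are prepended before the ordinary outputs and inputs, roughly
--
--   int f(__global int *global_failure,
--         __global int64_t *global_failure_args, ...);
--
-- and a nonzero return value means the function failed.
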
ensureDeviceFuns :: ImpGPU.KernelCode -> OnKernelM [Name]
ensureDeviceFuns code = do
  let called = calledFuncs calledInKernelOp code
  fmap catMaybes . forM (S.toList called) $ \fname -> do
    def <- asks $ lookupFunction fname
    case def of
      Just host_func -> do
        -- Functions are a priori always considered host-level, so we have
        -- to convert them to device code.  This is where most of our
        -- limitations on device-side functions (no arrays, no parallelism)
        -- come from.
        let device_func = fmap toDevice host_func
        ensureDeviceFun fname device_func
        pure $ Just fname
      Nothing -> pure Nothing
  where
    bad = compilerLimitationS "Cannot generate GPU functions that contain parallelism."
    toDevice :: HostOp -> KernelOp
    toDevice _ = bad

isConst :: GroupDim -> Maybe T.Text
isConst (Left (ValueExp (IntValue x))) =
  Just $ prettyText $ intToInt64 x
isConst (Right (SizeConst v)) =
  Just $ zEncodeText $ nameToText v
isConst (Right (SizeMaxConst size_class)) =
  Just $ "max_" <> prettyText size_class
isConst _ = Nothing
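-- Illustration (hypothetical values): a group dimension that is the literal
-- 256 yields @Just "256"@, one that is a size constant yields its z-encoded
-- name, and a dynamic dimension yields 'Nothing'.  Only when every dimension
-- is 'Just' can 'onKernel' below emit a FUTHARK_KERNEL_SIZED attribute for
-- the kernel.
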
onKernel :: KernelTarget -> Kernel -> OnKernelM OpenCL
onKernel target kernel = do
  called <- ensureDeviceFuns $ kernelBody kernel

  -- Crucial that this is done after 'ensureDeviceFuns', as the device
  -- functions may themselves define failure points.
  failures <- gets clFailures

  env <- ask

  let (kernel_body, cstate) =
        genGPUCode env KernelMode (kernelBody kernel) failures . GC.collect $ do
          body <- GC.collect $ GC.compileCode $ declsFirst $ kernelBody kernel
          -- No need to free, as we cannot allocate memory in kernels.
          mapM_ GC.item =<< GC.declAllocatedMem
          mapM_ GC.item body
      kstate = GC.compUserState cstate

      (const_defs, const_undefs) = unzip $ mapMaybe constDef $ kernelUses kernel

  let (local_memory_bytes, (local_memory_params, local_memory_args, local_memory_init)) =
        second unzip3 $
          evalState (mapAccumLM prepareLocalMemory 0 (kernelLocalMemory kstate)) blankNameSource

  let (use_params, unpack_params) =
        unzip $ mapMaybe useAsParam $ kernelUses kernel

  -- The local_failure variable is an int despite only really storing
  -- a single bit of information, as some OpenCL implementations
  -- (e.g. AMD) do not like byte-sized local memory (and the others
  -- likely pad to a whole word anyway).
  let (safety, error_init)
        -- We conservatively assume that any called function can fail.
        | not $ null called =
            ( SafetyFull,
              [C.citems|volatile __local int local_failure;
                        // Harmless for all threads to write this.
                        local_failure = 0;|]
            )
        | length (kernelFailures kstate) == length failures =
            if kernelFailureTolerant kernel
              then (SafetyNone, [])
              else
                -- No possible failures in this kernel, so if we make
                -- it past an initial check, then we are good to go.
                ( SafetyCheap,
                  [C.citems|if (*global_failure >= 0) { return; }|]
                )
        | otherwise =
            if not (kernelHasBarriers kstate)
              then
                ( SafetyFull,
                  [C.citems|if (*global_failure >= 0) { return; }|]
                )
              else
                ( SafetyFull,
                  [C.citems|
                     volatile __local int local_failure;
                     if (failure_is_an_option) {
                       int failed = *global_failure >= 0;
                       if (failed) {
                         return;
                       }
                     }
                     // All threads write this value - it looks like CUDA has a compiler bug otherwise.
                     local_failure = 0;
                     barrier(CLK_LOCAL_MEM_FENCE);
                  |]
                )

      failure_params =
        [ [C.cparam|__global int *global_failure|],
          [C.cparam|int failure_is_an_option|],
          [C.cparam|__global typename int64_t *global_failure_args|]
        ]

      (local_memory_param, prepare_local_memory) =
        case target of
          TargetOpenCL ->
            ( [[C.cparam|__local typename uint64_t* local_mem_aligned|]],
              [C.citems|__local unsigned char* local_mem = local_mem_aligned;|]
            )
          TargetCUDA -> (mempty, mempty)
          TargetHIP -> (mempty, mempty)

      params =
        local_memory_param
          ++ take (numFailureParams safety) failure_params
          ++ local_memory_params
          ++ use_params

      attribute =
        case mapM isConst $ kernelGroupSize kernel of
          Just [x, y, z] ->
            "FUTHARK_KERNEL_SIZED" <> prettyText (x, y, z) <> "\n"
          Just [x, y] ->
            "FUTHARK_KERNEL_SIZED" <> prettyText (x, y, 1 :: Int) <> "\n"
          Just [x] ->
            "FUTHARK_KERNEL_SIZED" <> prettyText (x, 1 :: Int, 1 :: Int) <> "\n"
          _ -> "FUTHARK_KERNEL\n"

      kernel_fun =
        attribute
          <> funcText
            [C.cfun|void $id:name ($params:params) {
                      $items:(mconcat unpack_params)
                      $items:const_defs
                      $items:prepare_local_memory
                      $items:local_memory_init
                      $items:error_init
                      $items:kernel_body

                      $id:(errorLabel kstate): return;

                      $items:const_undefs
                    }|]

  modify $ \s ->
    s
      { clGPU = M.insert name (safety, kernel_fun) $ clGPU s,
        clUsedTypes = typesInKernel kernel <> clUsedTypes s,
        clFailures = kernelFailures kstate
      }

  -- The error handling stuff is automatically added later.
  let args = local_memory_args ++ kernelArgs kernel

  pure $ LaunchKernel safety name local_memory_bytes args num_groups group_size
  where
    name = kernelName kernel
    num_groups = kernelNumGroups kernel
    group_size = kernelGroupSize kernel

    padTo8 e = e + ((8 - (e `rem` 8)) `rem` 8)

    prepareLocalMemory (Count offset) (mem, Count size) = do
      param <- newVName $ baseString mem ++ "_offset"
      let offset' = offset + padTo8 size
      pure
        ( bytes offset',
          ( [C.cparam|typename int64_t $id:param|],
            ValueKArg (untyped offset) $ IntType Int64,
            [C.citem|volatile __local $ty:defaultMemBlockType $id:mem = &local_mem[$id:param];|]
          )
        )

useAsParam :: KernelUse -> Maybe (C.Param, [C.BlockItem])
useAsParam (ScalarUse name pt) = do
  let name_bits = zEncodeText (prettyText name) <> "_bits"
      ctp = case pt of
        -- OpenCL does not permit bool as a kernel parameter type.
        Bool -> [C.cty|unsigned char|]
        Unit -> [C.cty|unsigned char|]
        _ -> primStorageType pt
  if ctp == primTypeToCType pt
    then Just ([C.cparam|$ty:ctp $id:name|], [])
    else
      let name_bits_e = [C.cexp|$id:name_bits|]
       in Just
            ( [C.cparam|$ty:ctp $id:name_bits|],
              [[C.citem|$ty:(primTypeToCType pt) $id:name = $exp:(fromStorage pt name_bits_e);|]]
            )
useAsParam (MemoryUse name) =
  Just ([C.cparam|__global $ty:defaultMemBlockType $id:name|], [])
useAsParam ConstUse {} =
  Nothing
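-- A concrete (hypothetical) instance of 'useAsParam': for @ScalarUse x Bool@
-- the kernel cannot take a @bool@ parameter, so it receives
-- @unsigned char x_bits@ and an unpacking item along the lines of
-- @bool x = ...(x_bits);@ produced by 'fromStorage'.  A scalar whose storage
-- type already equals its C type (e.g. f32) is simply passed as-is.
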
-- Constants are #defined as macros.  Since a constant name in one
-- kernel might potentially (although unlikely) also be used for
-- something else in another kernel, we #undef them after the kernel.
constDef :: KernelUse -> Maybe (C.BlockItem, C.BlockItem)
constDef (ConstUse v e) =
  Just
    ( [C.citem|$escstm:(T.unpack def)|],
      [C.citem|$escstm:(T.unpack undef)|]
    )
  where
    e' = compilePrimExp e
    def = "#define " <> idText (C.toIdent v mempty) <> " (" <> expText e' <> ")"
    undef = "#undef " <> idText (C.toIdent v mempty)
constDef _ = Nothing

commonPrelude :: T.Text
commonPrelude =
  halfH <> cScalarDefs <> atomicsH <> transposeCL <> copyCL

genOpenClPrelude :: S.Set PrimType -> T.Text
genOpenClPrelude ts =
  "#define FUTHARK_OPENCL\n" <> enable_f64 <> preludeCL <> commonPrelude
  where
    enable_f64
      | FloatType Float64 `S.member` ts = [untrimming|#define FUTHARK_F64_ENABLED|]
      | otherwise = mempty

genCUDAPrelude :: T.Text
genCUDAPrelude =
  "#define FUTHARK_CUDA\n" <> preludeCU <> commonPrelude

genHIPPrelude :: T.Text
genHIPPrelude =
  "#define FUTHARK_HIP\n" <> preludeCU <> commonPrelude

compilePrimExp :: PrimExp KernelConst -> C.Exp
compilePrimExp e = runIdentity $ GC.compilePrimExp compileKernelConst e
  where
    compileKernelConst (SizeConst key) =
      pure [C.cexp|$id:(zEncodeText (prettyText key))|]
    compileKernelConst (SizeMaxConst size_class) =
      pure [C.cexp|$id:("max_" <> prettyString size_class)|]

kernelArgs :: Kernel -> [KernelArg]
kernelArgs = mapMaybe useToArg . kernelUses
  where
    useToArg (MemoryUse mem) = Just $ MemKArg mem
    useToArg (ScalarUse v pt) = Just $ ValueKArg (LeafExp v pt) pt
    useToArg ConstUse {} = Nothing

nextErrorLabel :: GC.CompilerM KernelOp KernelState String
nextErrorLabel =
  errorLabel <$> GC.getUserState

incErrorLabel :: GC.CompilerM KernelOp KernelState ()
incErrorLabel =
  GC.modifyUserState $ \s -> s {kernelNextSync = kernelNextSync s + 1}

pendingError :: Bool -> GC.CompilerM KernelOp KernelState ()
pendingError b =
  GC.modifyUserState $ \s -> s {kernelSyncPending = b}

hasCommunication :: ImpGPU.KernelCode -> Bool
hasCommunication = any communicates
  where
    communicates ErrorSync {} = True
    communicates Barrier {} = True
    communicates _ = False
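-- A small illustration of why 'hasCommunication' matters (hypothetical
-- kernel): if the body contains a 'Barrier' or 'ErrorSync', a failing thread
-- must not simply return, since the remaining threads would then wait
-- forever at the barrier.  The failure path generated further down is
-- therefore
--
--   local_failure = 1; goto error_0;
--
-- (for the appropriate error label), rather than a plain @return;@ in
-- kernels or @return 1;@ in device functions.
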
-- Whether we are generating code for a kernel or a device function.
-- This has minor effects, such as exactly how failures are
-- propagated.
data OpsMode = KernelMode | FunMode deriving (Eq)

inKernelOperations ::
  Env ->
  OpsMode ->
  ImpGPU.KernelCode ->
  GC.Operations KernelOp KernelState
inKernelOperations env mode body =
  GC.Operations
    { GC.opsCompiler = kernelOps,
      GC.opsMemoryType = kernelMemoryType,
      GC.opsWriteScalar = kernelWriteScalar,
      GC.opsReadScalar = kernelReadScalar,
      GC.opsAllocate = cannotAllocate,
      GC.opsDeallocate = cannotDeallocate,
      GC.opsCopy = copyInKernel,
      GC.opsCopies = mempty,
      GC.opsFatMemory = False,
      GC.opsError = errorInKernel,
      GC.opsCall = callInKernel,
      GC.opsCritical = mempty
    }
  where
    has_communication = hasCommunication body

    fence FenceLocal = [C.cexp|CLK_LOCAL_MEM_FENCE|]
    fence FenceGlobal = [C.cexp|CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE|]

    kernelOps :: GC.OpCompiler KernelOp KernelState
    kernelOps (GetGroupId v i) =
      GC.stm [C.cstm|$id:v = get_group_id($int:i);|]
    kernelOps (GetLocalId v i) =
      GC.stm [C.cstm|$id:v = get_local_id($int:i);|]
    kernelOps (GetLocalSize v i) =
      GC.stm [C.cstm|$id:v = get_local_size($int:i);|]
    kernelOps (GetLockstepWidth v) =
      GC.stm [C.cstm|$id:v = LOCKSTEP_WIDTH;|]
    kernelOps (Barrier f) = do
      GC.stm [C.cstm|barrier($exp:(fence f));|]
      GC.modifyUserState $ \s -> s {kernelHasBarriers = True}
    kernelOps (MemFence FenceLocal) =
      GC.stm [C.cstm|mem_fence_local();|]
    kernelOps (MemFence FenceGlobal) =
      GC.stm [C.cstm|mem_fence_global();|]
    kernelOps (LocalAlloc name size) = do
      name' <- newVName $ prettyString name ++ "_backing"
      GC.modifyUserState $ \s ->
        s {kernelLocalMemory = (name', size) : kernelLocalMemory s}
      GC.stm [C.cstm|$id:name = (__local unsigned char*) $id:name';|]
    kernelOps (ErrorSync f) = do
      label <- nextErrorLabel
      pending <- kernelSyncPending <$> GC.getUserState
      when pending $ do
        pendingError False
        GC.stm [C.cstm|$id:label: barrier($exp:(fence f));|]
        GC.stm [C.cstm|if (local_failure) { return; }|]
      GC.stm [C.cstm|barrier($exp:(fence f));|]
      GC.modifyUserState $ \s -> s {kernelHasBarriers = True}
      incErrorLabel
    kernelOps (Atomic space aop) = atomicOps space aop

    atomicCast s t = do
      let volatile = [C.ctyquals|volatile|]
      let quals = case s of
            Space sid -> pointerQuals sid
            _ -> pointerQuals "global"
      pure [C.cty|$tyquals:(volatile++quals) $ty:t|]

    atomicSpace (Space sid) = sid
    atomicSpace _ = "global"

    doAtomic s t old arr ind val op ty = do
      ind' <- GC.compileExp $ untyped $ unCount ind
      val' <- GC.compileExp val
      cast <- atomicCast s ty
      GC.stm [C.cstm|$id:old = $id:op'(&(($ty:cast *)$id:arr)[$exp:ind'], ($ty:ty) $exp:val');|]
      where
        op' = op ++ "_" ++ prettyString t ++ "_" ++ atomicSpace s

    doAtomicCmpXchg s t old arr ind cmp val ty = do
      ind' <- GC.compileExp $ untyped $ unCount ind
      cmp' <- GC.compileExp cmp
      val' <- GC.compileExp val
      cast <- atomicCast s ty
      GC.stm [C.cstm|$id:old = $id:op(&(($ty:cast *)$id:arr)[$exp:ind'], $exp:cmp', $exp:val');|]
      where
        op = "atomic_cmpxchg_" ++ prettyString t ++ "_" ++ atomicSpace s

    doAtomicXchg s t old arr ind val ty = do
      cast <- atomicCast s ty
      ind' <- GC.compileExp $ untyped $ unCount ind
      val' <- GC.compileExp val
      GC.stm [C.cstm|$id:old = $id:op(&(($ty:cast *)$id:arr)[$exp:ind'], $exp:val');|]
      where
        op = "atomic_chg_" ++ prettyString t ++ "_" ++ atomicSpace s
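    -- The helpers above name the runtime function by operation, element type
    -- and address space.  As an illustration (assuming these are defined in
    -- the atomics header shipped with the RTS), an 'AtomicAdd' of an i32 in
    -- the default (global) space calls @atomic_add_i32_global@, while the
    -- same operation on @Space "local"@ calls @atomic_add_i32_local@.
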
    -- First the 64-bit operations.
    atomicOps s (AtomicAdd Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_add" [C.cty|typename int64_t|]
    atomicOps s (AtomicFAdd Float64 old arr ind val) =
      doAtomic s Float64 old arr ind val "atomic_fadd" [C.cty|double|]
    atomicOps s (AtomicSMax Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_smax" [C.cty|typename int64_t|]
    atomicOps s (AtomicSMin Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_smin" [C.cty|typename int64_t|]
    atomicOps s (AtomicUMax Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_umax" [C.cty|typename uint64_t|]
    atomicOps s (AtomicUMin Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_umin" [C.cty|typename uint64_t|]
    atomicOps s (AtomicAnd Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_and" [C.cty|typename int64_t|]
    atomicOps s (AtomicOr Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_or" [C.cty|typename int64_t|]
    atomicOps s (AtomicXor Int64 old arr ind val) =
      doAtomic s Int64 old arr ind val "atomic_xor" [C.cty|typename int64_t|]
    atomicOps s (AtomicCmpXchg (IntType Int64) old arr ind cmp val) =
      doAtomicCmpXchg s (IntType Int64) old arr ind cmp val [C.cty|typename int64_t|]
    atomicOps s (AtomicXchg (IntType Int64) old arr ind val) =
      doAtomicXchg s (IntType Int64) old arr ind val [C.cty|typename int64_t|]
    -- Then the 32-bit operations.
    atomicOps s (AtomicAdd t old arr ind val) =
      doAtomic s t old arr ind val "atomic_add" [C.cty|int|]
    atomicOps s (AtomicFAdd t old arr ind val) =
      doAtomic s t old arr ind val "atomic_fadd" [C.cty|float|]
    atomicOps s (AtomicSMax t old arr ind val) =
      doAtomic s t old arr ind val "atomic_smax" [C.cty|int|]
    atomicOps s (AtomicSMin t old arr ind val) =
      doAtomic s t old arr ind val "atomic_smin" [C.cty|int|]
    atomicOps s (AtomicUMax t old arr ind val) =
      doAtomic s t old arr ind val "atomic_umax" [C.cty|unsigned int|]
    atomicOps s (AtomicUMin t old arr ind val) =
      doAtomic s t old arr ind val "atomic_umin" [C.cty|unsigned int|]
    atomicOps s (AtomicAnd t old arr ind val) =
      doAtomic s t old arr ind val "atomic_and" [C.cty|int|]
    atomicOps s (AtomicOr t old arr ind val) =
      doAtomic s t old arr ind val "atomic_or" [C.cty|int|]
    atomicOps s (AtomicXor t old arr ind val) =
      doAtomic s t old arr ind val "atomic_xor" [C.cty|int|]
    atomicOps s (AtomicCmpXchg t old arr ind cmp val) =
      doAtomicCmpXchg s t old arr ind cmp val [C.cty|int|]
    atomicOps s (AtomicXchg t old arr ind val) =
      doAtomicXchg s t old arr ind val [C.cty|int|]

    cannotAllocate :: GC.Allocate KernelOp KernelState
    cannotAllocate _ =
      error "Cannot allocate memory in kernel"

    cannotDeallocate :: GC.Deallocate KernelOp KernelState
    cannotDeallocate _ _ =
      error "Cannot deallocate memory in kernel"

    copyInKernel :: GC.Copy KernelOp KernelState
    copyInKernel _ _ _ _ _ _ _ _ =
      error "Cannot bulk copy in kernel."
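    -- For the memory and scalar hooks below, the address space string is
    -- mapped through 'pointerQuals'.  As an illustration, a memory block in
    -- space "local" gets the C type @__local unsigned char*@ (assuming the
    -- default memory block type), and the pseudo-space "device" is treated
    -- as "global".
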
    kernelMemoryType space =
      pure [C.cty|$tyquals:(pointerQuals space) $ty:defaultMemBlockType|]

    kernelWriteScalar =
      GC.writeScalarPointerWithQuals pointerQuals

    kernelReadScalar =
      GC.readScalarPointerWithQuals pointerQuals

    whatNext = do
      label <- nextErrorLabel
      pendingError True
      pure $
        if has_communication
          then [C.citems|local_failure = 1; goto $id:label;|]
          else
            if mode == FunMode
              then [C.citems|return 1;|]
              else [C.citems|return;|]

    callInKernel dests fname args
      | functionMayFail fname env = do
          let out_args = [[C.cexp|&$id:d|] | d <- dests]
              args' =
                [C.cexp|global_failure|]
                  : [C.cexp|global_failure_args|]
                  : out_args ++ args
          what_next <- whatNext
          GC.item [C.citem|if ($id:(funName fname)($args:args') != 0) { $items:what_next; }|]
      | otherwise = do
          let out_args = [[C.cexp|&$id:d|] | d <- dests]
              args' = out_args ++ args
          GC.item [C.citem|$id:(funName fname)($args:args');|]

    errorInKernel msg@(ErrorMsg parts) backtrace = do
      n <- length . kernelFailures <$> GC.getUserState
      GC.modifyUserState $ \s ->
        s {kernelFailures = kernelFailures s ++ [FailureMsg msg backtrace]}
      let setArgs _ [] = pure []
          setArgs i (ErrorString {} : parts') = setArgs i parts'
          -- FIXME: bogus for non-ints.
          setArgs i (ErrorVal _ x : parts') = do
            x' <- GC.compileExp x
            stms <- setArgs (i + 1) parts'
            pure $ [C.cstm|global_failure_args[$int:i] = (typename int64_t)$exp:x';|] : stms
      argstms <- setArgs (0 :: Int) parts

      what_next <- whatNext

      GC.stm
        [C.cstm|{ if (atomic_cmpxchg_i32_global(global_failure, -1, $int:n) == -1)
                  { $stms:argstms; }
                  $items:what_next
                }|]
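-- To make the call-site side of the convention concrete (hypothetical
-- names): a call to a possibly-failing device function @f@ with one result
-- and one argument is emitted roughly as
--
--   if (f(global_failure, global_failure_args, &res, x) != 0) {
--     /* 'whatNext' items: set local_failure and jump, or return. */
--   }
--
-- whereas a function that cannot fail is called as plain @f(&res, x);@.
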
--- Checking requirements

typesInKernel :: Kernel -> S.Set PrimType
typesInKernel kernel = typesInCode $ kernelBody kernel

typesInCode :: ImpGPU.KernelCode -> S.Set PrimType
typesInCode Skip = mempty
typesInCode (c1 :>>: c2) = typesInCode c1 <> typesInCode c2
typesInCode (For _ e c) = typesInExp e <> typesInCode c
typesInCode (While (TPrimExp e) c) = typesInExp e <> typesInCode c
typesInCode DeclareMem {} = mempty
typesInCode (DeclareScalar _ _ t) = S.singleton t
typesInCode (DeclareArray _ t _) = S.singleton t
typesInCode (Allocate _ (Count (TPrimExp e)) _) = typesInExp e
typesInCode Free {} = mempty
typesInCode (LMADCopy _ shape _ (Count (TPrimExp dstoffset), dststrides) _ (Count (TPrimExp srcoffset), srcstrides)) =
  foldMap (typesInExp . untyped . unCount) shape
    <> typesInExp dstoffset
    <> foldMap (typesInExp . untyped . unCount) dststrides
    <> typesInExp srcoffset
    <> foldMap (typesInExp . untyped . unCount) srcstrides
typesInCode (Write _ (Count (TPrimExp e1)) t _ _ e2) =
  typesInExp e1 <> S.singleton t <> typesInExp e2
typesInCode (Read _ _ (Count (TPrimExp e1)) t _ _) =
  typesInExp e1 <> S.singleton t
typesInCode (SetScalar _ e) = typesInExp e
typesInCode SetMem {} = mempty
typesInCode (Call _ _ es) = mconcat $ map typesInArg es
  where
    typesInArg MemArg {} = mempty
    typesInArg (ExpArg e) = typesInExp e
typesInCode (If (TPrimExp e) c1 c2) =
  typesInExp e <> typesInCode c1 <> typesInCode c2
typesInCode (Assert e _ _) = typesInExp e
typesInCode (Comment _ c) = typesInCode c
typesInCode (DebugPrint _ v) = maybe mempty typesInExp v
typesInCode (TracePrint msg) = foldMap typesInExp msg
typesInCode Op {} = mempty

typesInExp :: Exp -> S.Set PrimType
typesInExp (ValueExp v) = S.singleton $ primValueType v
typesInExp (BinOpExp _ e1 e2) = typesInExp e1 <> typesInExp e2
typesInExp (CmpOpExp _ e1 e2) = typesInExp e1 <> typesInExp e2
typesInExp (ConvOpExp op e) = S.fromList [from, to] <> typesInExp e
  where
    (from, to) = convOpType op
typesInExp (UnOpExp _ e) = typesInExp e
typesInExp (FunExp _ args t) = S.singleton t <> mconcat (map typesInExp args)
typesInExp LeafExp {} = mempty
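-- A short example of how this scan is used (hypothetical kernel): a body
-- that declares an f64 scalar and reads an i32 from memory makes
-- 'typesInKernel' return a set containing @FloatType Float64@ and
-- @IntType Int32@; the presence of the former is what causes
-- 'genOpenClPrelude' above to emit @#define FUTHARK_F64_ENABLED@.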