{-# LANGUAGE TypeFamilies #-}

-- | Code generation for segmented and non-segmented scans.  Uses a
-- fast single-pass algorithm, but which only works on NVIDIA GPUs and
-- with some constraints on the operator.  We use this when we can.
module Futhark.CodeGen.ImpGen.GPU.SegScan.SinglePass (compileSegScan) where

import Control.Monad
import Data.List (zip4, zip7)
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.CodeGen.ImpGen.GPU.Base
import Futhark.IR.GPUMem
import Futhark.IR.Mem.LMAD qualified as LMAD
import Futhark.Transform.Rename
import Futhark.Util (mapAccumLM, takeLast)
import Futhark.Util.IntegralExp (IntegralExp (mod, rem), divUp, nextMul, quot)
import Prelude hiding (mod, quot, rem)

xParams, yParams :: SegBinOp GPUMem -> [LParam GPUMem]
xParams :: SegBinOp GPUMem -> [LParam GPUMem]
xParams SegBinOp GPUMem
scan =
  Int
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. Int -> [a] -> [a]
take ([SubExp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (SegBinOp GPUMem -> [SubExp]
forall rep. SegBinOp rep -> [SubExp]
segBinOpNeutral SegBinOp GPUMem
scan)) (Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams (SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan))
yParams :: SegBinOp GPUMem -> [LParam GPUMem]
yParams SegBinOp GPUMem
scan =
  Int
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. Int -> [a] -> [a]
drop ([SubExp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length (SegBinOp GPUMem -> [SubExp]
forall rep. SegBinOp rep -> [SubExp]
segBinOpNeutral SegBinOp GPUMem
scan)) (Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams (SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan))

createLocalArrays ::
  Count BlockSize SubExp ->
  SubExp ->
  [PrimType] ->
  InKernelGen (VName, [VName], [VName], VName, [VName])
createLocalArrays :: Count BlockSize SubExp
-> SubExp
-> [PrimType]
-> InKernelGen (VName, [VName], [VName], VName, [VName])
createLocalArrays (Count SubExp
block_size) SubExp
chunk [PrimType]
types = do
  let block_sizeE :: TExp Int64
block_sizeE = SubExp -> TExp Int64
pe64 SubExp
block_size
      workSize :: TExp Int64
workSize = SubExp -> TExp Int64
pe64 SubExp
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
block_sizeE
      prefixArraysSize :: TExp Int64
prefixArraysSize =
        (TExp Int64 -> TExp Int64 -> TExp Int64)
-> TExp Int64 -> [TExp Int64] -> TExp Int64
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (\TExp Int64
acc TExp Int64
tySize -> TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
nextMul TExp Int64
acc TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
block_sizeE) TExp Int64
0 ([TExp Int64] -> TExp Int64) -> [TExp Int64] -> TExp Int64
forall a b. (a -> b) -> a -> b
$
          (PrimType -> TExp Int64) -> [PrimType] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize [PrimType]
types
      maxTransposedArraySize :: TExp Int64
maxTransposedArraySize =
        (TExp Int64 -> TExp Int64 -> TExp Int64)
-> [TExp Int64] -> TExp Int64
forall a. (a -> a -> a) -> [a] -> a
forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 TExp Int64 -> TExp Int64 -> TExp Int64
forall v. TPrimExp Int64 v -> TPrimExp Int64 v -> TPrimExp Int64 v
sMax64 ([TExp Int64] -> TExp Int64) -> [TExp Int64] -> TExp Int64
forall a b. (a -> b) -> a -> b
$ (PrimType -> TExp Int64) -> [PrimType] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map (\PrimType
ty -> TExp Int64
workSize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize PrimType
ty) [PrimType]
types

      warp_size :: (Num a) => a
      warp_size :: forall a. Num a => a
warp_size = a
32
      maxWarpExchangeSize :: TExp Int64
maxWarpExchangeSize =
        (TExp Int64 -> TExp Int64 -> TExp Int64)
-> TExp Int64 -> [TExp Int64] -> TExp Int64
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl (\TExp Int64
acc TExp Int64
tySize -> TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
nextMul TExp Int64
acc TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* Integer -> TExp Int64
forall a. Num a => Integer -> a
fromInteger Integer
forall a. Num a => a
warp_size) TExp Int64
0 ([TExp Int64] -> TExp Int64) -> [TExp Int64] -> TExp Int64
forall a b. (a -> b) -> a -> b
$
          (PrimType -> TExp Int64) -> [PrimType] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize [PrimType]
types
      maxLookbackSize :: TExp Int64
maxLookbackSize = TExp Int64
maxWarpExchangeSize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
forall a. Num a => a
warp_size
      size :: Count Bytes (TExp Int64)
size = TExp Int64 -> Count Bytes (TExp Int64)
forall a. a -> Count Bytes a
Imp.bytes (TExp Int64 -> Count Bytes (TExp Int64))
-> TExp Int64 -> Count Bytes (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
maxLookbackSize TExp Int64 -> TExp Int64 -> TExp Int64
forall v. TPrimExp Int64 v -> TPrimExp Int64 v -> TPrimExp Int64 v
`sMax64` TExp Int64
prefixArraysSize TExp Int64 -> TExp Int64 -> TExp Int64
forall v. TPrimExp Int64 v -> TPrimExp Int64 v -> TPrimExp Int64 v
`sMax64` TExp Int64
maxTransposedArraySize

  (TExp Int64
_, [TExp Int64]
byteOffsets) <-
    (TExp Int64
 -> TExp Int64
 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64, TExp Int64))
-> TExp Int64
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64])
forall (m :: * -> *) (t :: * -> *) acc x y.
(Monad m, Traversable t) =>
(acc -> x -> m (acc, y)) -> acc -> t x -> m (acc, t y)
mapAccumLM
      ( \TExp Int64
off TExp Int64
tySize -> do
          TExp Int64
off' <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"byte_offsets" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
nextMul TExp Int64
off TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ SubExp -> TExp Int64
pe64 SubExp
block_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tySize
          (TExp Int64, TExp Int64)
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, TExp Int64)
forall a. a -> ImpM GPUMem KernelEnv KernelOp a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (TExp Int64
off', TExp Int64
off)
      )
      TExp Int64
0
      ([TExp Int64]
 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64]))
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64])
forall a b. (a -> b) -> a -> b
$ (PrimType -> TExp Int64) -> [PrimType] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize [PrimType]
types

  (TExp Int64
_, [TExp Int64]
warpByteOffsets) <-
    (TExp Int64
 -> TExp Int64
 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64, TExp Int64))
-> TExp Int64
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64])
forall (m :: * -> *) (t :: * -> *) acc x y.
(Monad m, Traversable t) =>
(acc -> x -> m (acc, y)) -> acc -> t x -> m (acc, t y)
mapAccumLM
      ( \TExp Int64
off TExp Int64
tySize -> do
          TExp Int64
off' <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"warp_byte_offset" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
nextMul TExp Int64
off TExp Int64
tySize TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
forall a. Num a => a
warp_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tySize
          (TExp Int64, TExp Int64)
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, TExp Int64)
forall a. a -> ImpM GPUMem KernelEnv KernelOp a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (TExp Int64
off', TExp Int64
off)
      )
      TExp Int64
forall a. Num a => a
warp_size
      ([TExp Int64]
 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64]))
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp (TExp Int64, [TExp Int64])
forall a b. (a -> b) -> a -> b
$ (PrimType -> TExp Int64) -> [PrimType] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize [PrimType]
types

  Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Allocate reusable shared memory" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ () -> ImpM GPUMem KernelEnv KernelOp ()
forall a. a -> ImpM GPUMem KernelEnv KernelOp a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()

  VName
localMem <- String
-> Count Bytes (TExp Int64)
-> Space
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String -> Count Bytes (TExp Int64) -> Space -> ImpM rep r op VName
sAlloc String
"local_mem" Count Bytes (TExp Int64)
size (String -> Space
Space String
"shared")
  TV Int64
transposeArrayLength <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"trans_arr_len" TExp Int64
workSize

  VName
sharedId <- String
-> PrimType
-> ShapeBase SubExp
-> VName
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> VName -> ImpM rep r op VName
sArrayInMem String
"shared_id" PrimType
int32 ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [Int32 -> SubExp
forall v. IsValue v => v -> SubExp
constant (Int32
1 :: Int32)]) VName
localMem

  [VName]
transposedArrays <-
    [PrimType]
-> (PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [PrimType]
types ((PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
 -> ImpM GPUMem KernelEnv KernelOp [VName])
-> (PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall a b. (a -> b) -> a -> b
$ \PrimType
ty ->
      String
-> PrimType
-> ShapeBase SubExp
-> VName
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> VName -> ImpM rep r op VName
sArrayInMem
        String
"local_transpose_arr"
        PrimType
ty
        ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int64
transposeArrayLength])
        VName
localMem

  [VName]
prefixArrays <-
    [(TExp Int64, PrimType)]
-> ((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM ([TExp Int64] -> [PrimType] -> [(TExp Int64, PrimType)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TExp Int64]
byteOffsets [PrimType]
types) (((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
 -> ImpM GPUMem KernelEnv KernelOp [VName])
-> ((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall a b. (a -> b) -> a -> b
$ \(TExp Int64
off, PrimType
ty) -> do
      let off' :: TExp Int64
off' = TExp Int64
off TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
`quot` PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize PrimType
ty
      String
-> PrimType
-> ShapeBase SubExp
-> VName
-> LMAD
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType
-> ShapeBase SubExp
-> VName
-> LMAD
-> ImpM rep r op VName
sArray
        String
"local_prefix_arr"
        PrimType
ty
        ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [SubExp
block_size])
        VName
localMem
        (LMAD -> ImpM GPUMem KernelEnv KernelOp VName)
-> LMAD -> ImpM GPUMem KernelEnv KernelOp VName
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> [TExp Int64] -> LMAD
forall num. IntegralExp num => num -> [num] -> LMAD num
LMAD.iota TExp Int64
off' [SubExp -> TExp Int64
pe64 SubExp
block_size]

  VName
warpscan <- String
-> PrimType
-> ShapeBase SubExp
-> VName
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> VName -> ImpM rep r op VName
sArrayInMem String
"warpscan" PrimType
int8 ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [Int64 -> SubExp
forall v. IsValue v => v -> SubExp
constant (Int64
forall a. Num a => a
warp_size :: Int64)]) VName
localMem
  [VName]
warpExchanges <-
    [(TExp Int64, PrimType)]
-> ((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM ([TExp Int64] -> [PrimType] -> [(TExp Int64, PrimType)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TExp Int64]
warpByteOffsets [PrimType]
types) (((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
 -> ImpM GPUMem KernelEnv KernelOp [VName])
-> ((TExp Int64, PrimType) -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall a b. (a -> b) -> a -> b
$ \(TExp Int64
off, PrimType
ty) -> do
      let off' :: TExp Int64
off' = TExp Int64
off TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
`quot` PrimType -> TExp Int64
forall a. Num a => PrimType -> a
primByteSize PrimType
ty
      String
-> PrimType
-> ShapeBase SubExp
-> VName
-> LMAD
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType
-> ShapeBase SubExp
-> VName
-> LMAD
-> ImpM rep r op VName
sArray
        String
"warp_exchange"
        PrimType
ty
        ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [Int64 -> SubExp
forall v. IsValue v => v -> SubExp
constant (Int64
forall a. Num a => a
warp_size :: Int64)])
        VName
localMem
        (LMAD -> ImpM GPUMem KernelEnv KernelOp VName)
-> LMAD -> ImpM GPUMem KernelEnv KernelOp VName
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> [TExp Int64] -> LMAD
forall num. IntegralExp num => num -> [num] -> LMAD num
LMAD.iota TExp Int64
off' [TExp Int64
forall a. Num a => a
warp_size]

  (VName, [VName], [VName], VName, [VName])
-> InKernelGen (VName, [VName], [VName], VName, [VName])
forall a. a -> ImpM GPUMem KernelEnv KernelOp a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (VName
sharedId, [VName]
transposedArrays, [VName]
prefixArrays, VName
warpscan, [VName]
warpExchanges)

statusX, statusA, statusP :: (Num a) => a
statusX :: forall a. Num a => a
statusX = a
0
statusA :: forall a. Num a => a
statusA = a
1
statusP :: forall a. Num a => a
statusP = a
2

inBlockScanLookback ::
  KernelConstants ->
  Imp.TExp Int64 ->
  VName ->
  [VName] ->
  Lambda GPUMem ->
  InKernelGen ()
inBlockScanLookback :: KernelConstants
-> TExp Int64
-> VName
-> [VName]
-> Lambda GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
inBlockScanLookback KernelConstants
constants TExp Int64
arrs_full_size VName
flag_arr [VName]
arrs Lambda GPUMem
scan_lam = ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
  TV Any
flg_x <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"flg_x" PrimType
p_int8
  TV Int8
flg_y <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Int8)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"flg_y" PrimType
p_int8
  let flg_param_x :: Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_x = Attrs
-> VName
-> MemInfo SubExp NoUniqueness MemBind
-> Param (MemInfo SubExp NoUniqueness MemBind)
forall dec. Attrs -> VName -> dec -> Param dec
Param Attrs
forall a. Monoid a => a
mempty (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
flg_x) (PrimType -> MemInfo SubExp NoUniqueness MemBind
forall d u ret. PrimType -> MemInfo d u ret
MemPrim PrimType
p_int8)
      flg_param_y :: Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_y = Attrs
-> VName
-> MemInfo SubExp NoUniqueness MemBind
-> Param (MemInfo SubExp NoUniqueness MemBind)
forall dec. Attrs -> VName -> dec -> Param dec
Param Attrs
forall a. Monoid a => a
mempty (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flg_y) (PrimType -> MemInfo SubExp NoUniqueness MemBind
forall d u ret. PrimType -> MemInfo d u ret
MemPrim PrimType
p_int8)
      flg_y_exp :: TPrimExp Int8 VName
flg_y_exp = TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flg_y
      statusP_e :: TPrimExp Int8 VName
statusP_e = TPrimExp Int8 VName
forall a. Num a => a
statusP :: Imp.TExp Int8
      statusX_e :: TPrimExp Int8 VName
statusX_e = TPrimExp Int8 VName
forall a. Num a => a
statusX :: Imp.TExp Int8

  [LParam GPUMem] -> ImpM GPUMem KernelEnv KernelOp ()
forall rep (inner :: * -> *) r op.
Mem rep inner =>
[LParam rep] -> ImpM rep r op ()
dLParams (Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_lam)

  TV Int32
skip_threads <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Int32)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"skip_threads" PrimType
int32
  let in_block_thread_active :: TPrimExp Bool VName
in_block_thread_active =
        TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
skip_threads TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<=. TExp Int32
in_block_id
      actual_params :: [LParam GPUMem]
actual_params = Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_lam
      ([Param (MemInfo SubExp NoUniqueness MemBind)]
x_params, [Param (MemInfo SubExp NoUniqueness MemBind)]
y_params) =
        Int
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> ([Param (MemInfo SubExp NoUniqueness MemBind)],
    [Param (MemInfo SubExp NoUniqueness MemBind)])
forall a. Int -> [a] -> ([a], [a])
splitAt ([Param (MemInfo SubExp NoUniqueness MemBind)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [LParam GPUMem]
[Param (MemInfo SubExp NoUniqueness MemBind)]
actual_params Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
2) [LParam GPUMem]
[Param (MemInfo SubExp NoUniqueness MemBind)]
actual_params
      y_to_x :: ImpM GPUMem KernelEnv KernelOp ()
y_to_x =
        [(Param (MemInfo SubExp NoUniqueness MemBind),
  Param (MemInfo SubExp NoUniqueness MemBind))]
-> ((Param (MemInfo SubExp NoUniqueness MemBind),
     Param (MemInfo SubExp NoUniqueness MemBind))
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [(Param (MemInfo SubExp NoUniqueness MemBind),
     Param (MemInfo SubExp NoUniqueness MemBind))]
forall a b. [a] -> [b] -> [(a, b)]
zip [Param (MemInfo SubExp NoUniqueness MemBind)]
x_params [Param (MemInfo SubExp NoUniqueness MemBind)]
y_params) (((Param (MemInfo SubExp NoUniqueness MemBind),
   Param (MemInfo SubExp NoUniqueness MemBind))
  -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((Param (MemInfo SubExp NoUniqueness MemBind),
     Param (MemInfo SubExp NoUniqueness MemBind))
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(Param (MemInfo SubExp NoUniqueness MemBind)
x, Param (MemInfo SubExp NoUniqueness MemBind)
y) ->
          Bool
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall shape u. TypeBase shape u -> Bool
primType (Param (MemInfo SubExp NoUniqueness MemBind)
-> TypeBase (ShapeBase SubExp) NoUniqueness
forall dec.
Typed dec =>
Param dec -> TypeBase (ShapeBase SubExp) NoUniqueness
paramType Param (MemInfo SubExp NoUniqueness MemBind)
x)) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
            VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM rep r op ()
copyDWIM (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
x) [] (VName -> SubExp
Var (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
y)) []
      y_to_x_flg :: ImpM GPUMem KernelEnv KernelOp ()
y_to_x_flg =
        VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM rep r op ()
copyDWIM (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
flg_x) [] (VName -> SubExp
Var (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flg_y)) []

  -- Set initial y values
  Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"read input for in-block scan" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
    (Param (MemInfo SubExp NoUniqueness MemBind)
 -> VName -> ImpM GPUMem KernelEnv KernelOp ())
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [VName]
-> ImpM GPUMem KernelEnv KernelOp ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ Param (MemInfo SubExp NoUniqueness MemBind)
-> VName -> ImpM GPUMem KernelEnv KernelOp ()
readInitial (Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_y Param (MemInfo SubExp NoUniqueness MemBind)
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. a -> [a] -> [a]
: [Param (MemInfo SubExp NoUniqueness MemBind)]
y_params) (VName
flag_arr VName -> [VName] -> [VName]
forall a. a -> [a] -> [a]
: [VName]
arrs)
    -- Since the final result is expected to be in x_params, we may
    -- need to copy it there for the first thread in the block.
    TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
in_block_id TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
      ImpM GPUMem KernelEnv KernelOp ()
y_to_x
      ImpM GPUMem KernelEnv KernelOp ()
y_to_x_flg

  Bool
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
array_scan ImpM GPUMem KernelEnv KernelOp ()
barrier

  let op_to_x :: ImpM GPUMem KernelEnv KernelOp ()
op_to_x = do
        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
          (TPrimExp Int8 VName
flg_y_exp TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
statusP_e TPrimExp Bool VName -> TPrimExp Bool VName -> TPrimExp Bool VName
forall v.
Eq v =>
TPrimExp Bool v -> TPrimExp Bool v -> TPrimExp Bool v
.||. TPrimExp Int8 VName
flg_y_exp TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
statusX_e)
          ( do
              ImpM GPUMem KernelEnv KernelOp ()
y_to_x_flg
              ImpM GPUMem KernelEnv KernelOp ()
y_to_x
          )
          ([Param (MemInfo SubExp NoUniqueness MemBind)]
-> Body GPUMem -> ImpM GPUMem KernelEnv KernelOp ()
forall dec rep r op. [Param dec] -> Body rep -> ImpM rep r op ()
compileBody' [Param (MemInfo SubExp NoUniqueness MemBind)]
x_params (Body GPUMem -> ImpM GPUMem KernelEnv KernelOp ())
-> Body GPUMem -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_lam)

  Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"in-block scan (hopefully no barriers needed)" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
    TV Int32
skip_threads TV Int32 -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- TExp Int32
1

    TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhile (TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
skip_threads TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int32
block_size) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
      TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen TPrimExp Bool VName
in_block_thread_active (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"read operands" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
          (Param (MemInfo SubExp NoUniqueness MemBind)
 -> VName -> ImpM GPUMem KernelEnv KernelOp ())
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [VName]
-> ImpM GPUMem KernelEnv KernelOp ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_
            (TExp Int64
-> Param (MemInfo SubExp NoUniqueness MemBind)
-> VName
-> ImpM GPUMem KernelEnv KernelOp ()
readParam (TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int32 -> TExp Int64) -> TExp Int32 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
skip_threads))
            (Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_x Param (MemInfo SubExp NoUniqueness MemBind)
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. a -> [a] -> [a]
: [Param (MemInfo SubExp NoUniqueness MemBind)]
x_params)
            (VName
flag_arr VName -> [VName] -> [VName]
forall a. a -> [a] -> [a]
: [VName]
arrs)
        Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"perform operation" ImpM GPUMem KernelEnv KernelOp ()
op_to_x

        Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"write result" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
          [ImpM GPUMem KernelEnv KernelOp ()]
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, Monad m) =>
t (m a) -> m ()
sequence_ ([ImpM GPUMem KernelEnv KernelOp ()]
 -> ImpM GPUMem KernelEnv KernelOp ())
-> [ImpM GPUMem KernelEnv KernelOp ()]
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
            (Param (MemInfo SubExp NoUniqueness MemBind)
 -> Param (MemInfo SubExp NoUniqueness MemBind)
 -> VName
 -> ImpM GPUMem KernelEnv KernelOp ())
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [VName]
-> [ImpM GPUMem KernelEnv KernelOp ()]
forall a b c d. (a -> b -> c -> d) -> [a] -> [b] -> [c] -> [d]
zipWith3
              Param (MemInfo SubExp NoUniqueness MemBind)
-> Param (MemInfo SubExp NoUniqueness MemBind)
-> VName
-> ImpM GPUMem KernelEnv KernelOp ()
writeResult
              (Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_x Param (MemInfo SubExp NoUniqueness MemBind)
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. a -> [a] -> [a]
: [Param (MemInfo SubExp NoUniqueness MemBind)]
x_params)
              (Param (MemInfo SubExp NoUniqueness MemBind)
flg_param_y Param (MemInfo SubExp NoUniqueness MemBind)
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. a -> [a] -> [a]
: [Param (MemInfo SubExp NoUniqueness MemBind)]
y_params)
              (VName
flag_arr VName -> [VName] -> [VName]
forall a. a -> [a] -> [a]
: [VName]
arrs)

      TV Int32
skip_threads TV Int32 -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
skip_threads TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
2
  where
    p_int8 :: PrimType
p_int8 = IntType -> PrimType
IntType IntType
Int8
    block_size :: TExp Int32
block_size = TExp Int32
32
    block_id :: TExp Int32
block_id = TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TExp Int32
forall e. IntegralExp e => e -> e -> e
`quot` TExp Int32
block_size
    in_block_id :: TExp Int32
in_block_id = TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
block_id TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
block_size
    ltid32 :: TExp Int32
ltid32 = KernelConstants -> TExp Int32
kernelLocalThreadId KernelConstants
constants
    ltid :: TExp Int64
ltid = TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int32
ltid32
    gtid :: TExp Int64
gtid = TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int32 -> TExp Int64) -> TExp Int32 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ KernelConstants -> TExp Int32
kernelGlobalThreadId KernelConstants
constants
    array_scan :: Bool
array_scan = Bool -> Bool
not (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ (TypeBase (ShapeBase SubExp) NoUniqueness -> Bool)
-> [TypeBase (ShapeBase SubExp) NoUniqueness] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall shape u. TypeBase shape u -> Bool
primType ([TypeBase (ShapeBase SubExp) NoUniqueness] -> Bool)
-> [TypeBase (ShapeBase SubExp) NoUniqueness] -> Bool
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [TypeBase (ShapeBase SubExp) NoUniqueness]
forall rep.
Lambda rep -> [TypeBase (ShapeBase SubExp) NoUniqueness]
lambdaReturnType Lambda GPUMem
scan_lam
    barrier :: ImpM GPUMem KernelEnv KernelOp ()
barrier
      | Bool
array_scan =
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ Fence -> KernelOp
Imp.Barrier Fence
Imp.FenceGlobal
      | Bool
otherwise =
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ Fence -> KernelOp
Imp.Barrier Fence
Imp.FenceLocal

    readInitial :: Param (MemInfo SubExp NoUniqueness MemBind)
-> VName -> ImpM GPUMem KernelEnv KernelOp ()
readInitial Param (MemInfo SubExp NoUniqueness MemBind)
p VName
arr
      | TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall shape u. TypeBase shape u -> Bool
primType (TypeBase (ShapeBase SubExp) NoUniqueness -> Bool)
-> TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall a b. (a -> b) -> a -> b
$ Param (MemInfo SubExp NoUniqueness MemBind)
-> TypeBase (ShapeBase SubExp) NoUniqueness
forall dec.
Typed dec =>
Param dec -> TypeBase (ShapeBase SubExp) NoUniqueness
paramType Param (MemInfo SubExp NoUniqueness MemBind)
p =
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
p) [] (VName -> SubExp
Var VName
arr) [TExp Int64
ltid]
      | Bool
otherwise =
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
p) [] (VName -> SubExp
Var VName
arr) [TExp Int64
gtid]
    readParam :: TExp Int64
-> Param (MemInfo SubExp NoUniqueness MemBind)
-> VName
-> ImpM GPUMem KernelEnv KernelOp ()
readParam TExp Int64
behind Param (MemInfo SubExp NoUniqueness MemBind)
p VName
arr
      | TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall shape u. TypeBase shape u -> Bool
primType (TypeBase (ShapeBase SubExp) NoUniqueness -> Bool)
-> TypeBase (ShapeBase SubExp) NoUniqueness -> Bool
forall a b. (a -> b) -> a -> b
$ Param (MemInfo SubExp NoUniqueness MemBind)
-> TypeBase (ShapeBase SubExp) NoUniqueness
forall dec.
Typed dec =>
Param dec -> TypeBase (ShapeBase SubExp) NoUniqueness
paramType Param (MemInfo SubExp NoUniqueness MemBind)
p =
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
p) [] (VName -> SubExp
Var VName
arr) [TExp Int64
ltid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
behind]
      | Bool
otherwise =
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
p) [] (VName -> SubExp
Var VName
arr) [TExp Int64
gtid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
behind TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
arrs_full_size]

    writeResult :: Param (MemInfo SubExp NoUniqueness MemBind)
-> Param (MemInfo SubExp NoUniqueness MemBind)
-> VName
-> ImpM GPUMem KernelEnv KernelOp ()
writeResult Param (MemInfo SubExp NoUniqueness MemBind)
x Param (MemInfo SubExp NoUniqueness MemBind)
y VName
arr = do
      Bool
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Param (MemInfo SubExp NoUniqueness MemBind) -> Bool
forall p. Typed p => Param p -> Bool
isPrimParam Param (MemInfo SubExp NoUniqueness MemBind)
x) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
        VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
arr [TExp Int64
ltid] (VName -> SubExp
Var (VName -> SubExp) -> VName -> SubExp
forall a b. (a -> b) -> a -> b
$ Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
x) []
      VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName
-> [DimIndex (TExp Int64)]
-> SubExp
-> [DimIndex (TExp Int64)]
-> ImpM rep r op ()
copyDWIM (Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
y) [] (VName -> SubExp
Var (VName -> SubExp) -> VName -> SubExp
forall a b. (a -> b) -> a -> b
$ Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName Param (MemInfo SubExp NoUniqueness MemBind)
x) []

-- | Compile 'SegScan' instance to host-level code with calls to a
-- single-pass kernel.
compileSegScan ::
  Pat LetDecMem ->
  SegLevel ->
  SegSpace ->
  SegBinOp GPUMem ->
  KernelBody GPUMem ->
  CallKernelGen ()
compileSegScan :: Pat (MemInfo SubExp NoUniqueness MemBind)
-> SegLevel
-> SegSpace
-> SegBinOp GPUMem
-> KernelBody GPUMem
-> CallKernelGen ()
compileSegScan Pat (MemInfo SubExp NoUniqueness MemBind)
pat SegLevel
lvl SegSpace
space SegBinOp GPUMem
scan_op KernelBody GPUMem
map_kbody = do
  KernelAttrs
attrs <- SegLevel -> CallKernelGen KernelAttrs
lvlKernelAttrs SegLevel
lvl
  let Pat [PatElem (MemInfo SubExp NoUniqueness MemBind)]
all_pes = Pat (MemInfo SubExp NoUniqueness MemBind)
pat

      scanop_nes :: [SubExp]
scanop_nes = SegBinOp GPUMem -> [SubExp]
forall rep. SegBinOp rep -> [SubExp]
segBinOpNeutral SegBinOp GPUMem
scan_op

      n :: TExp Int64
n = [TExp Int64] -> TExp Int64
forall a. Num a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product ([TExp Int64] -> TExp Int64) -> [TExp Int64] -> TExp Int64
forall a b. (a -> b) -> a -> b
$ (SubExp -> TExp Int64) -> [SubExp] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 ([SubExp] -> [TExp Int64]) -> [SubExp] -> [TExp Int64]
forall a b. (a -> b) -> a -> b
$ SegSpace -> [SubExp]
segSpaceDims SegSpace
space

      tys' :: [TypeBase (ShapeBase SubExp) NoUniqueness]
tys' = Lambda GPUMem -> [TypeBase (ShapeBase SubExp) NoUniqueness]
forall rep.
Lambda rep -> [TypeBase (ShapeBase SubExp) NoUniqueness]
lambdaReturnType (Lambda GPUMem -> [TypeBase (ShapeBase SubExp) NoUniqueness])
-> Lambda GPUMem -> [TypeBase (ShapeBase SubExp) NoUniqueness]
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan_op

      tys :: [PrimType]
tys = (TypeBase (ShapeBase SubExp) NoUniqueness -> PrimType)
-> [TypeBase (ShapeBase SubExp) NoUniqueness] -> [PrimType]
forall a b. (a -> b) -> [a] -> [b]
map TypeBase (ShapeBase SubExp) NoUniqueness -> PrimType
forall shape u. TypeBase shape u -> PrimType
elemType [TypeBase (ShapeBase SubExp) NoUniqueness]
tys'

      tblock_size_e :: TExp Int64
tblock_size_e = SubExp -> TExp Int64
pe64 (SubExp -> TExp Int64) -> SubExp -> TExp Int64
forall a b. (a -> b) -> a -> b
$ Count BlockSize SubExp -> SubExp
forall {k} (u :: k) e. Count u e -> e
unCount (Count BlockSize SubExp -> SubExp)
-> Count BlockSize SubExp -> SubExp
forall a b. (a -> b) -> a -> b
$ KernelAttrs -> Count BlockSize SubExp
kAttrBlockSize KernelAttrs
attrs
      num_phys_blocks_e :: TExp Int64
num_phys_blocks_e = SubExp -> TExp Int64
pe64 (SubExp -> TExp Int64) -> SubExp -> TExp Int64
forall a b. (a -> b) -> a -> b
$ Count NumBlocks SubExp -> SubExp
forall {k} (u :: k) e. Count u e -> e
unCount (Count NumBlocks SubExp -> SubExp)
-> Count NumBlocks SubExp -> SubExp
forall a b. (a -> b) -> a -> b
$ KernelAttrs -> Count NumBlocks SubExp
kAttrNumBlocks KernelAttrs
attrs

  let chunk_const :: KernelConstExp
chunk_const = [TypeBase (ShapeBase SubExp) NoUniqueness] -> KernelConstExp
getChunkSize [TypeBase (ShapeBase SubExp) NoUniqueness]
tys'
  TV Int64
chunk_v <- String -> TExp Int64 -> ImpM GPUMem HostEnv HostOp (TV Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"chunk_size" (TExp Int64 -> ImpM GPUMem HostEnv HostOp (TV Int64))
-> (PrimExp VName -> TExp Int64)
-> PrimExp VName
-> ImpM GPUMem HostEnv HostOp (TV Int64)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. PrimExp VName -> TExp Int64
forall v. PrimExp v -> TPrimExp Int64 v
isInt64 (PrimExp VName -> ImpM GPUMem HostEnv HostOp (TV Int64))
-> ImpM GPUMem HostEnv HostOp (PrimExp VName)
-> ImpM GPUMem HostEnv HostOp (TV Int64)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< KernelConstExp -> ImpM GPUMem HostEnv HostOp (PrimExp VName)
kernelConstToExp KernelConstExp
chunk_const
  let chunk :: TExp Int64
chunk = TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
chunk_v

  SubExp
num_virt_blocks <-
    TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize (TV Int64 -> SubExp)
-> ImpM GPUMem HostEnv HostOp (TV Int64)
-> ImpM GPUMem HostEnv HostOp SubExp
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String -> TExp Int64 -> ImpM GPUMem HostEnv HostOp (TV Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"num_virt_blocks" (TExp Int64
n TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
`divUp` (TExp Int64
tblock_size_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk))
  let num_virt_blocks_e :: TExp Int64
num_virt_blocks_e = SubExp -> TExp Int64
pe64 SubExp
num_virt_blocks

  TExp Int64
num_virt_threads <-
    String -> TExp Int64 -> ImpM GPUMem HostEnv HostOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"num_virt_threads" (TExp Int64 -> ImpM GPUMem HostEnv HostOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem HostEnv HostOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
num_virt_blocks_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e

  let ([VName]
gtids, [SubExp]
dims) = [(VName, SubExp)] -> ([VName], [SubExp])
forall a b. [(a, b)] -> ([a], [b])
unzip ([(VName, SubExp)] -> ([VName], [SubExp]))
-> [(VName, SubExp)] -> ([VName], [SubExp])
forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
      dims' :: [TExp Int64]
dims' = (SubExp -> TExp Int64) -> [SubExp] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 [SubExp]
dims
      segmented :: Bool
segmented = [TExp Int64] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [TExp Int64]
dims' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1
      not_segmented_e :: TPrimExp Bool VName
not_segmented_e = Bool -> TPrimExp Bool VName
forall v. Bool -> TPrimExp Bool v
fromBool (Bool -> TPrimExp Bool VName) -> Bool -> TPrimExp Bool VName
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not Bool
segmented
      segment_size :: TExp Int64
segment_size = [TExp Int64] -> TExp Int64
forall a. HasCallStack => [a] -> a
last [TExp Int64]
dims'

  Code HostOp -> CallKernelGen ()
forall op rep r. Code op -> ImpM rep r op ()
emit (Code HostOp -> CallKernelGen ())
-> Code HostOp -> CallKernelGen ()
forall a b. (a -> b) -> a -> b
$ String -> Maybe (PrimExp VName) -> Code HostOp
forall a. String -> Maybe (PrimExp VName) -> Code a
Imp.DebugPrint String
"Sequential elements per thread (chunk)" (Maybe (PrimExp VName) -> Code HostOp)
-> Maybe (PrimExp VName) -> Code HostOp
forall a b. (a -> b) -> a -> b
$ PrimExp VName -> Maybe (PrimExp VName)
forall a. a -> Maybe a
Just (PrimExp VName -> Maybe (PrimExp VName))
-> PrimExp VName -> Maybe (PrimExp VName)
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> PrimExp VName
forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
chunk

  VName
statusFlags <- String
-> PrimType
-> ShapeBase SubExp
-> Space
-> ImpM GPUMem HostEnv HostOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> Space -> ImpM rep r op VName
sAllocArray String
"status_flags" PrimType
int8 ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [SubExp
num_virt_blocks]) (String -> Space
Space String
"device")
  VName -> SubExp -> CallKernelGen ()
sReplicate VName
statusFlags (SubExp -> CallKernelGen ()) -> SubExp -> CallKernelGen ()
forall a b. (a -> b) -> a -> b
$ IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusX

  ([VName]
aggregateArrays, [VName]
incprefixArrays) <-
    ([(VName, VName)] -> ([VName], [VName]))
-> ImpM GPUMem HostEnv HostOp [(VName, VName)]
-> ImpM GPUMem HostEnv HostOp ([VName], [VName])
forall a b.
(a -> b)
-> ImpM GPUMem HostEnv HostOp a -> ImpM GPUMem HostEnv HostOp b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap [(VName, VName)] -> ([VName], [VName])
forall a b. [(a, b)] -> ([a], [b])
unzip (ImpM GPUMem HostEnv HostOp [(VName, VName)]
 -> ImpM GPUMem HostEnv HostOp ([VName], [VName]))
-> ImpM GPUMem HostEnv HostOp [(VName, VName)]
-> ImpM GPUMem HostEnv HostOp ([VName], [VName])
forall a b. (a -> b) -> a -> b
$
      [PrimType]
-> (PrimType -> ImpM GPUMem HostEnv HostOp (VName, VName))
-> ImpM GPUMem HostEnv HostOp [(VName, VName)]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [PrimType]
tys ((PrimType -> ImpM GPUMem HostEnv HostOp (VName, VName))
 -> ImpM GPUMem HostEnv HostOp [(VName, VName)])
-> (PrimType -> ImpM GPUMem HostEnv HostOp (VName, VName))
-> ImpM GPUMem HostEnv HostOp [(VName, VName)]
forall a b. (a -> b) -> a -> b
$ \PrimType
ty ->
        (,)
          (VName -> VName -> (VName, VName))
-> ImpM GPUMem HostEnv HostOp VName
-> ImpM GPUMem HostEnv HostOp (VName -> (VName, VName))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> String
-> PrimType
-> ShapeBase SubExp
-> Space
-> ImpM GPUMem HostEnv HostOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> Space -> ImpM rep r op VName
sAllocArray String
"aggregates" PrimType
ty ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [SubExp
num_virt_blocks]) (String -> Space
Space String
"device")
          ImpM GPUMem HostEnv HostOp (VName -> (VName, VName))
-> ImpM GPUMem HostEnv HostOp VName
-> ImpM GPUMem HostEnv HostOp (VName, VName)
forall a b.
ImpM GPUMem HostEnv HostOp (a -> b)
-> ImpM GPUMem HostEnv HostOp a -> ImpM GPUMem HostEnv HostOp b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> String
-> PrimType
-> ShapeBase SubExp
-> Space
-> ImpM GPUMem HostEnv HostOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> Space -> ImpM rep r op VName
sAllocArray String
"incprefixes" PrimType
ty ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [SubExp
num_virt_blocks]) (String -> Space
Space String
"device")

  VName
global_id <- String -> Int -> ImpM GPUMem HostEnv HostOp VName
genZeroes String
"global_dynid" Int
1

  let attrs' :: KernelAttrs
attrs' = KernelAttrs
attrs {kAttrConstExps :: Map VName KernelConstExp
kAttrConstExps = VName -> KernelConstExp -> Map VName KernelConstExp
forall k a. k -> a -> Map k a
M.singleton (TV Int64 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int64
chunk_v) KernelConstExp
chunk_const}

  String
-> VName
-> KernelAttrs
-> ImpM GPUMem KernelEnv KernelOp ()
-> CallKernelGen ()
sKernelThread String
"segscan" (SegSpace -> VName
segFlat SegSpace
space) KernelAttrs
attrs' (ImpM GPUMem KernelEnv KernelOp () -> CallKernelGen ())
-> ImpM GPUMem KernelEnv KernelOp () -> CallKernelGen ()
forall a b. (a -> b) -> a -> b
$ do
    TExp Int32
chunk32 <- String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"chunk_size_32b" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64 -> TExp Int32) -> TExp Int64 -> TExp Int32
forall a b. (a -> b) -> a -> b
$ TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
chunk_v

    KernelConstants
constants <- KernelEnv -> KernelConstants
kernelConstants (KernelEnv -> KernelConstants)
-> ImpM GPUMem KernelEnv KernelOp KernelEnv
-> ImpM GPUMem KernelEnv KernelOp KernelConstants
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ImpM GPUMem KernelEnv KernelOp KernelEnv
forall rep r op. ImpM rep r op r
askEnv

    let ltid32 :: TExp Int32
ltid32 = KernelConstants -> TExp Int32
kernelLocalThreadId KernelConstants
constants
        ltid :: TExp Int64
ltid = TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int32
ltid32

    (VName
sharedId, [VName]
transposedArrays, [VName]
prefixArrays, VName
warpscan, [VName]
exchanges) <-
      Count BlockSize SubExp
-> SubExp
-> [PrimType]
-> InKernelGen (VName, [VName], [VName], VName, [VName])
createLocalArrays (KernelAttrs -> Count BlockSize SubExp
kAttrBlockSize KernelAttrs
attrs) (TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int64
chunk_v) [PrimType]
tys

    -- We wrap the entire kernel body in a virtualisation loop to
    -- handle the case where we do not have enough thread blocks to
    -- cover the iteration space. Dynamic block indexing has no
    -- implication on this, since each block simply fetches a new
    -- dynamic ID upon entry into the virtualisation loop.
    --
    -- We could use virtualiseBlocks, but this introduces a barrier which is
    -- redundant in this case, and also we don't need to base virtual block IDs
    -- on the loop variable, but rather on the dynamic IDs.
    TV Int64
phys_block_id <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"phys_block_id" PrimType
int32
    KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ VName -> Int -> KernelOp
Imp.GetBlockId (TV Int64 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int64
phys_block_id) Int
0
    TExp Int64
iters <-
      String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"virtloop_bound" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$
        (TExp Int64
num_virt_blocks_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
phys_block_id)
          TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
`divUp` TExp Int64
num_phys_blocks_e

    String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"virtloop_i" TExp Int64
iters ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ ImpM GPUMem KernelEnv KernelOp ()
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. a -> b -> a
const (ImpM GPUMem KernelEnv KernelOp ()
 -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> TExp Int64
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
      TV Int64
dyn_id <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"dynamic_id" PrimType
int32
      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"First thread in block fetches this block's dynamic_id" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
          (VName
globalIdMem, Space
_, Count Elements (TExp Int64)
globalIdOff) <- VName
-> [TExp Int64]
-> ImpM
     GPUMem
     KernelEnv
     KernelOp
     (VName, Space, Count Elements (TExp Int64))
forall rep r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray VName
global_id [TExp Int64
0]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
            Space -> AtomicOp -> KernelOp
Imp.Atomic Space
DefaultSpace (AtomicOp -> KernelOp) -> AtomicOp -> KernelOp
forall a b. (a -> b) -> a -> b
$
              IntType
-> VName
-> VName
-> Count Elements (TExp Int64)
-> PrimExp VName
-> AtomicOp
Imp.AtomicAdd
                IntType
Int32
                (TV Int64 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int64
dyn_id)
                VName
globalIdMem
                (TExp Int64 -> Count Elements (TExp Int64)
forall {k} (u :: k) e. e -> Count u e
Count (TExp Int64 -> Count Elements (TExp Int64))
-> TExp Int64 -> Count Elements (TExp Int64)
forall a b. (a -> b) -> a -> b
$ Count Elements (TExp Int64) -> TExp Int64
forall {k} (u :: k) e. Count u e -> e
unCount Count Elements (TExp Int64)
globalIdOff)
                (TExp Int32 -> PrimExp VName
forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (TExp Int32
1 :: Imp.TExp Int32))
          Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Set dynamic id for this block" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
sharedId [TExp Int64
0] (TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int64
dyn_id) []

          Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"First thread in last (virtual) block resets global dynamic_id" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int64
num_virt_blocks_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
global_id [TExp Int64
0] (IntType -> Integer -> SubExp
intConst IntType
Int32 Integer
0) []

      let local_barrier :: KernelOp
local_barrier = Fence -> KernelOp
Imp.Barrier Fence
Imp.FenceLocal
          local_fence :: KernelOp
local_fence = Fence -> KernelOp
Imp.MemFence Fence
Imp.FenceLocal
          global_fence :: KernelOp
global_fence = Fence -> KernelOp
Imp.MemFence Fence
Imp.FenceGlobal

      KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
      VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int64 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int64
dyn_id) [] (VName -> SubExp
Var VName
sharedId) [TExp Int64
0]
      KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier

      TExp Int64
block_offset <-
        String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"block_offset" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$
          TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id) TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e
      TExp Int64
sgm_idx <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"sgm_idx" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
block_offset TExp Int64 -> TExp Int64 -> TExp Int64
forall e. IntegralExp e => e -> e -> e
`mod` TExp Int64
segment_size
      TExp Int32
boundary <-
        String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"boundary" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall a b. (a -> b) -> a -> b
$
          TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64 -> TExp Int32) -> TExp Int64 -> TExp Int32
forall a b. (a -> b) -> a -> b
$
            TExp Int64 -> TExp Int64 -> TExp Int64
forall v. TPrimExp Int64 v -> TPrimExp Int64 v -> TPrimExp Int64 v
sMin64 (TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e) (TExp Int64
segment_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
sgm_idx)
      TExp Int32
segsize_compact <-
        String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"segsize_compact" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall a b. (a -> b) -> a -> b
$
          TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64 -> TExp Int32) -> TExp Int64 -> TExp Int32
forall a b. (a -> b) -> a -> b
$
            TExp Int64 -> TExp Int64 -> TExp Int64
forall v. TPrimExp Int64 v -> TPrimExp Int64 v -> TPrimExp Int64 v
sMin64 (TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e) TExp Int64
segment_size
      [VName]
private_chunks <-
        [PrimType]
-> (PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [PrimType]
tys ((PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
 -> ImpM GPUMem KernelEnv KernelOp [VName])
-> (PrimType -> ImpM GPUMem KernelEnv KernelOp VName)
-> ImpM GPUMem KernelEnv KernelOp [VName]
forall a b. (a -> b) -> a -> b
$ \PrimType
ty ->
          String
-> PrimType
-> ShapeBase SubExp
-> Space
-> ImpM GPUMem KernelEnv KernelOp VName
forall rep r op.
String
-> PrimType -> ShapeBase SubExp -> Space -> ImpM rep r op VName
sAllocArray
            String
"private"
            PrimType
ty
            ([SubExp] -> ShapeBase SubExp
forall d. [d] -> ShapeBase d
Shape [TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int64
chunk_v])
            ([SubExp] -> PrimType -> Space
ScalarSpace [TV Int64 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int64
chunk_v] PrimType
ty)

      TExp Int64
thd_offset <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"thd_offset" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
block_offset TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
ltid

      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Load and map" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
        String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
          -- The map's input index
          TExp Int64
virt_tid <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"virt_tid" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
thd_offset TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
i TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e
          [(VName, TExp Int64)]
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
[(VName, TExp Int64)] -> TExp Int64 -> ImpM rep r op ()
dIndexSpace ([VName] -> [TExp Int64] -> [(VName, TExp Int64)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
gtids [TExp Int64]
dims') TExp Int64
virt_tid
          -- Perform the map
          let in_bounds :: ImpM GPUMem KernelEnv KernelOp ()
in_bounds =
                Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (KernelBody GPUMem -> Stms GPUMem
forall rep. KernelBody rep -> Stms rep
kernelBodyStms KernelBody GPUMem
map_kbody) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                  let ([KernelResult]
all_scan_res, [KernelResult]
map_res) =
                        Int -> [KernelResult] -> ([KernelResult], [KernelResult])
forall a. Int -> [a] -> ([a], [a])
splitAt ([SegBinOp GPUMem] -> Int
forall rep. [SegBinOp rep] -> Int
segBinOpResults [SegBinOp GPUMem
scan_op]) ([KernelResult] -> ([KernelResult], [KernelResult]))
-> [KernelResult] -> ([KernelResult], [KernelResult])
forall a b. (a -> b) -> a -> b
$ KernelBody GPUMem -> [KernelResult]
forall rep. KernelBody rep -> [KernelResult]
kernelBodyResult KernelBody GPUMem
map_kbody

                  -- Write map results to their global memory destinations
                  [(PatElem (MemInfo SubExp NoUniqueness MemBind), KernelResult)]
-> ((PatElem (MemInfo SubExp NoUniqueness MemBind), KernelResult)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([PatElem (MemInfo SubExp NoUniqueness MemBind)]
-> [KernelResult]
-> [(PatElem (MemInfo SubExp NoUniqueness MemBind), KernelResult)]
forall a b. [a] -> [b] -> [(a, b)]
zip (Int
-> [PatElem (MemInfo SubExp NoUniqueness MemBind)]
-> [PatElem (MemInfo SubExp NoUniqueness MemBind)]
forall a. Int -> [a] -> [a]
takeLast ([KernelResult] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [KernelResult]
map_res) [PatElem (MemInfo SubExp NoUniqueness MemBind)]
all_pes) [KernelResult]
map_res) (((PatElem (MemInfo SubExp NoUniqueness MemBind), KernelResult)
  -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((PatElem (MemInfo SubExp NoUniqueness MemBind), KernelResult)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(PatElem (MemInfo SubExp NoUniqueness MemBind)
dest, KernelResult
src) ->
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (PatElem (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. PatElem dec -> VName
patElemName PatElem (MemInfo SubExp NoUniqueness MemBind)
dest) ((VName -> TExp Int64) -> [VName] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map VName -> TExp Int64
forall a. a -> TPrimExp Int64 a
Imp.le64 [VName]
gtids) (KernelResult -> SubExp
kernelResultSubExp KernelResult
src) []

                  -- Write to-scan results to private memory.
                  [(VName, SubExp)]
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [SubExp] -> [(VName, SubExp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
private_chunks ([SubExp] -> [(VName, SubExp)]) -> [SubExp] -> [(VName, SubExp)]
forall a b. (a -> b) -> a -> b
$ (KernelResult -> SubExp) -> [KernelResult] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map KernelResult -> SubExp
kernelResultSubExp [KernelResult]
all_scan_res) (((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
dest, SubExp
src) ->
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
dest [TExp Int64
i] SubExp
src []

              out_of_bounds :: ImpM GPUMem KernelEnv KernelOp ()
out_of_bounds =
                [(VName, SubExp)]
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [SubExp] -> [(VName, SubExp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
private_chunks [SubExp]
scanop_nes) (((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
dest, SubExp
ne) ->
                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
dest [TExp Int64
i] SubExp
ne []

          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf (TExp Int64
virt_tid TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int64
n) ImpM GPUMem KernelEnv KernelOp ()
in_bounds ImpM GPUMem KernelEnv KernelOp ()
out_of_bounds

      KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ Fence -> KernelOp
Imp.ErrorSync Fence
Imp.FenceLocal
      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Transpose scan inputs" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        [(VName, VName)]
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [VName] -> [(VName, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
transposedArrays [VName]
private_chunks) (((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
trans, VName
priv) -> do
          String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
            TExp Int64
sharedIdx <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"sharedIdx" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
ltid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
i TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
trans [TExp Int64
sharedIdx] (VName -> SubExp
Var VName
priv) [TExp Int64
i]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
          String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
            TV Int64
sharedIdx <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"sharedIdx" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
ltid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
i
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
priv [TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int64
i] (VName -> SubExp
Var VName
trans) [TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int64 -> TExp Int64) -> TExp Int64 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
sharedIdx]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier

      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Per thread scan" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        -- We don't need to touch the first element, so only m-1
        -- iterations here.
        String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" (TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1) ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
          let xs :: [VName]
xs = (LParam GPUMem -> VName) -> [LParam GPUMem] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map LParam GPUMem -> VName
Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([LParam GPUMem] -> [VName]) -> [LParam GPUMem] -> [VName]
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> [LParam GPUMem]
xParams SegBinOp GPUMem
scan_op
              ys :: [VName]
ys = (LParam GPUMem -> VName) -> [LParam GPUMem] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map LParam GPUMem -> VName
Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([LParam GPUMem] -> [VName]) -> [LParam GPUMem] -> [VName]
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> [LParam GPUMem]
yParams SegBinOp GPUMem
scan_op
          -- determine if start of segment
          TPrimExp Bool VName
new_sgm <-
            if Bool
segmented
              then do
                TExp Int32
gidx <- String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"gidx" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall a b. (a -> b) -> a -> b
$ (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
chunk32) TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
1
                String
-> TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"new_sgm" (TPrimExp Bool VName
 -> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName))
-> TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName)
forall a b. (a -> b) -> a -> b
$ (TExp Int32
gidx TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 TExp Int64
i TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
boundary) TExp Int32 -> TExp Int32 -> TExp Int32
forall e. IntegralExp e => e -> e -> e
`mod` TExp Int32
segsize_compact TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0
              else TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName)
forall a. a -> ImpM GPUMem KernelEnv KernelOp a
forall (f :: * -> *) a. Applicative f => a -> f a
pure TPrimExp Bool VName
forall v. TPrimExp Bool v
false
          -- skip scan of first element in segment
          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sUnless TPrimExp Bool VName
new_sgm (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            [(VName, VName, VName, PrimType)]
-> ((VName, VName, VName, PrimType)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName]
-> [VName]
-> [VName]
-> [PrimType]
-> [(VName, VName, VName, PrimType)]
forall a b c d. [a] -> [b] -> [c] -> [d] -> [(a, b, c, d)]
zip4 [VName]
private_chunks [VName]
xs [VName]
ys [PrimType]
tys) (((VName, VName, VName, PrimType)
  -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, VName, VName, PrimType)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
src, VName
x, VName
y, PrimType
ty) -> do
              VName -> PrimType -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimType -> ImpM rep r op ()
dPrim_ VName
x PrimType
ty
              VName -> PrimType -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimType -> ImpM rep r op ()
dPrim_ VName
y PrimType
ty
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
x [] (VName -> SubExp
Var VName
src) [TExp Int64
i]
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
y [] (VName -> SubExp
Var VName
src) [TExp Int64
i TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
1]

            Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (Body GPUMem -> Stms GPUMem
forall rep. Body rep -> Stms rep
bodyStms (Body GPUMem -> Stms GPUMem) -> Body GPUMem -> Stms GPUMem
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody (Lambda GPUMem -> Body GPUMem) -> Lambda GPUMem -> Body GPUMem
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan_op) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
              [(VName, SubExp)]
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [SubExp] -> [(VName, SubExp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
private_chunks ([SubExp] -> [(VName, SubExp)]) -> [SubExp] -> [(VName, SubExp)]
forall a b. (a -> b) -> a -> b
$ (SubExpRes -> SubExp) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map SubExpRes -> SubExp
resSubExp ([SubExpRes] -> [SubExp]) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> a -> b
$ Body GPUMem -> [SubExpRes]
forall rep. Body rep -> [SubExpRes]
bodyResult (Body GPUMem -> [SubExpRes]) -> Body GPUMem -> [SubExpRes]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody (Lambda GPUMem -> Body GPUMem) -> Lambda GPUMem -> Body GPUMem
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan_op) (((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
dest, SubExp
res) ->
                VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
dest [TExp Int64
i TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
1] SubExp
res []

      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Publish results in shared memory" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        [(VName, VName)]
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [VName] -> [(VName, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
prefixArrays [VName]
private_chunks) (((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
dest, VName
src) ->
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
dest [TExp Int64
ltid] (VName -> SubExp
Var VName
src) [TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
        KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier

      let crossesSegment :: Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
crossesSegment = do
            Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard Bool
segmented
            (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
-> Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
forall a. a -> Maybe a
Just ((TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
 -> Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName))
-> (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
-> Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
forall a b. (a -> b) -> a -> b
$ \TExp Int32
from TExp Int32
to ->
              let from' :: TExp Int32
from' = (TExp Int32
from TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
1) TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
chunk32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
1
                  to' :: TExp Int32
to' = (TExp Int32
to TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
1) TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
chunk32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
1
               in (TExp Int32
to' TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
from') TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.>. (TExp Int32
to' TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
segsize_compact TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
boundary) TExp Int32 -> TExp Int32 -> TExp Int32
forall e. IntegralExp e => e -> e -> e
`mod` TExp Int32
segsize_compact

      Lambda GPUMem
scan_op1 <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda (Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem))
-> Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall a b. (a -> b) -> a -> b
$ SegBinOp GPUMem -> Lambda GPUMem
forall rep. SegBinOp rep -> Lambda rep
segBinOpLambda SegBinOp GPUMem
scan_op

      [TV Any]
accs <- (PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> [PrimType] -> ImpM GPUMem KernelEnv KernelOp [TV Any]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM (String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"acc") [PrimType]
tys
      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Scan results (with warp scan)" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
-> TExp Int64
-> TExp Int64
-> Lambda GPUMem
-> [VName]
-> ImpM GPUMem KernelEnv KernelOp ()
blockScan
          Maybe (TExp Int32 -> TExp Int32 -> TPrimExp Bool VName)
crossesSegment
          TExp Int64
tblock_size_e
          TExp Int64
num_virt_threads
          Lambda GPUMem
scan_op1
          [VName]
prefixArrays

        KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp (KernelOp -> ImpM GPUMem KernelEnv KernelOp ())
-> KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ Fence -> KernelOp
Imp.ErrorSync Fence
Imp.FenceLocal

        let firstThread :: TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ()
firstThread TV Any
acc VName
prefixes =
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
acc) [] (VName -> SubExp
Var VName
prefixes) [TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int64
tblock_size_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
            notFirstThread :: TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ()
notFirstThread TV Any
acc VName
prefixes =
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
acc) [] (VName -> SubExp
Var VName
prefixes) [TExp Int64
ltid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
          (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0)
          ((TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ())
-> [TV Any] -> [VName] -> ImpM GPUMem KernelEnv KernelOp ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ()
firstThread [TV Any]
accs [VName]
prefixArrays)
          ((TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ())
-> [TV Any] -> [VName] -> ImpM GPUMem KernelEnv KernelOp ()
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ TV Any -> VName -> ImpM GPUMem KernelEnv KernelOp ()
notFirstThread [TV Any]
accs [VName]
prefixArrays)

        KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier

      [TV Any]
prefixes <- [(SubExp, PrimType)]
-> ((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> ImpM GPUMem KernelEnv KernelOp [TV Any]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM ([SubExp] -> [PrimType] -> [(SubExp, PrimType)]
forall a b. [a] -> [b] -> [(a, b)]
zip [SubExp]
scanop_nes [PrimType]
tys) (((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
 -> ImpM GPUMem KernelEnv KernelOp [TV Any])
-> ((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> ImpM GPUMem KernelEnv KernelOp [TV Any]
forall a b. (a -> b) -> a -> b
$ \(SubExp
ne, PrimType
ty) ->
        String -> TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"prefix" (TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall a b. (a -> b) -> a -> b
$ PrimExp VName -> TExp Any
forall {k} (t :: k) v. PrimExp v -> TPrimExp t v
TPrimExp (PrimExp VName -> TExp Any) -> PrimExp VName -> TExp Any
forall a b. (a -> b) -> a -> b
$ PrimType -> SubExp -> PrimExp VName
forall a. ToExp a => PrimType -> a -> PrimExp VName
toExp' PrimType
ty SubExp
ne
      TPrimExp Bool VName
blockNewSgm <- String
-> TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"block_new_sgm" (TPrimExp Bool VName
 -> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName))
-> TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp (TPrimExp Bool VName)
forall a b. (a -> b) -> a -> b
$ TExp Int64
sgm_idx TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int64
0
      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Perform lookback" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TPrimExp Bool VName
blockNewSgm TPrimExp Bool VName -> TPrimExp Bool VName -> TPrimExp Bool VName
forall v.
Eq v =>
TPrimExp Bool v -> TPrimExp Bool v -> TPrimExp Bool v
.&&. TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
          ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
            [(TV Any, VName)]
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [VName] -> [(TV Any, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TV Any]
accs [VName]
incprefixArrays) (((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(TV Any
acc, VName
incprefixArray) ->
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
incprefixArray [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (TV Any -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Any
acc) []
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
global_fence
          ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
statusFlags [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusP) []
          [(SubExp, TV Any)]
-> ((SubExp, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([SubExp] -> [TV Any] -> [(SubExp, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [SubExp]
scanop_nes [TV Any]
accs) (((SubExp, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((SubExp, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(SubExp
ne, TV Any
acc) ->
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
acc) [] SubExp
ne []
        -- end sWhen

        let warp_size :: TExp Int32
warp_size = KernelConstants -> TExp Int32
kernelWaveSize KernelConstants
constants
        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TPrimExp Bool VName -> TPrimExp Bool VName
forall v. TPrimExp Bool v -> TPrimExp Bool v
bNot TPrimExp Bool VName
blockNewSgm TPrimExp Bool VName -> TPrimExp Bool VName -> TPrimExp Bool VName
forall v.
Eq v =>
TPrimExp Bool v -> TPrimExp Bool v -> TPrimExp Bool v
.&&. TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int32
warp_size) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
              (TPrimExp Bool VName
not_segmented_e TPrimExp Bool VName -> TPrimExp Bool VName -> TPrimExp Bool VName
forall v.
Eq v =>
TPrimExp Bool v -> TPrimExp Bool v -> TPrimExp Bool v
.||. TExp Int32
boundary TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64
tblock_size_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk))
              ( do
                  ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                    [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
aggregateArrays [TV Any]
accs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
aggregateArray, TV Any
acc) ->
                      VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
aggregateArray [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (TV Any -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Any
acc) []
                  KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
global_fence
                  ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
statusFlags [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusA) []
              )
              ( do
                  ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                    [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
incprefixArrays [TV Any]
accs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
incprefixArray, TV Any
acc) ->
                      VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
incprefixArray [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (TV Any -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Any
acc) []
                  KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
global_fence
                  ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
statusFlags [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusP) []
              )
            ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
warpscan [TExp Int64
0] (VName -> SubExp
Var VName
statusFlags) [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
          -- sWhen
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_fence

          TV Int8
status <- String -> PrimType -> ImpM GPUMem KernelEnv KernelOp (TV Int8)
forall {k} rep r op (t :: k).
String -> PrimType -> ImpM rep r op (TV t)
dPrim String
"status" PrimType
int8 :: InKernelGen (TV Int8)
          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
status) [] (VName -> SubExp
Var VName
warpscan) [TExp Int64
0]

          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
            (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
status TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
forall a. Num a => a
statusP)
            ( TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                  [(TV Any, VName)]
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [VName] -> [(TV Any, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TV Any]
prefixes [VName]
incprefixArrays) (((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(TV Any
prefix, VName
incprefixArray) ->
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
prefix) [] (VName -> SubExp
Var VName
incprefixArray) [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
            )
            ( do
                TV Int32
readOffset <-
                  String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"readOffset" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32)
forall a b. (a -> b) -> a -> b
$
                    TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64 -> TExp Int32) -> TExp Int64 -> TExp Int32
forall a b. (a -> b) -> a -> b
$
                      TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (KernelConstants -> TExp Int32
kernelWaveSize KernelConstants
constants)
                let loopStop :: TExp Int32
loopStop = TExp Int32
warp_size TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* (-TExp Int32
1)
                    sameSegment :: TV Int32 -> TPrimExp Bool VName
sameSegment TV Int32
readIdx
                      | Bool
segmented =
                          let startIdx :: TExp Int64
startIdx = TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readIdx TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
1) TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1
                           in TExp Int64
block_offset TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
startIdx TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<=. TExp Int64
sgm_idx
                      | Bool
otherwise = TPrimExp Bool VName
forall v. TPrimExp Bool v
true
                TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhile (TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readOffset TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.>. TExp Int32
loopStop) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                  TV Int32
readI <- String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"read_i" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TV Int32)
forall a b. (a -> b) -> a -> b
$ TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readOffset TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
ltid32
                  [TV Any]
aggrs <- [(SubExp, PrimType)]
-> ((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> ImpM GPUMem KernelEnv KernelOp [TV Any]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM ([SubExp] -> [PrimType] -> [(SubExp, PrimType)]
forall a b. [a] -> [b] -> [(a, b)]
zip [SubExp]
scanop_nes [PrimType]
tys) (((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
 -> ImpM GPUMem KernelEnv KernelOp [TV Any])
-> ((SubExp, PrimType) -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> ImpM GPUMem KernelEnv KernelOp [TV Any]
forall a b. (a -> b) -> a -> b
$ \(SubExp
ne, PrimType
ty) ->
                    String -> TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"aggr" (TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any))
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp (TV Any)
forall a b. (a -> b) -> a -> b
$ PrimExp VName -> TExp Any
forall {k} (t :: k) v. PrimExp v -> TPrimExp t v
TPrimExp (PrimExp VName -> TExp Any) -> PrimExp VName -> TExp Any
forall a b. (a -> b) -> a -> b
$ PrimType -> SubExp -> PrimExp VName
forall a. ToExp a => PrimType -> a -> PrimExp VName
toExp' PrimType
ty SubExp
ne
                  TV Int8
flag <- String
-> TPrimExp Int8 VName -> ImpM GPUMem KernelEnv KernelOp (TV Int8)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"flag" (TPrimExp Int8 VName
forall a. Num a => a
statusX :: Imp.TExp Int8)
                  ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (ImpM GPUMem KernelEnv KernelOp ()
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readI TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.>=. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                    TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
                      (TV Int32 -> TPrimExp Bool VName
sameSegment TV Int32
readI)
                      ( do
                          VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flag) [] (VName -> SubExp
Var VName
statusFlags) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int32 -> TExp Int64) -> TExp Int32 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readI]
                          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
                            (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
forall a. Num a => a
statusP)
                            ( [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
incprefixArrays [TV Any]
aggrs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
incprefix, TV Any
aggr) ->
                                VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
aggr) [] (VName -> SubExp
Var VName
incprefix) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int32 -> TExp Int64) -> TExp Int32 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readI]
                            )
                            ( TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
forall a. Num a => a
statusA) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                                [(TV Any, VName)]
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [VName] -> [(TV Any, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TV Any]
aggrs [VName]
aggregateArrays) (((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(TV Any
aggr, VName
aggregate) ->
                                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
aggr) [] (VName -> SubExp
Var VName
aggregate) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int32 -> TExp Int64) -> TExp Int32 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readI]
                            )
                      )
                      (VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flag) [] (IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusP) [])
                  -- end sIf
                  -- end sWhen

                  [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
exchanges [TV Any]
aggrs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
exchange, TV Any
aggr) ->
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
exchange [TExp Int64
ltid] (TV Any -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Any
aggr) []
                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
warpscan [TExp Int64
ltid] (TV Int8 -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Int8
flag) []

                  -- execute warp-parallel reduction but only if the last read flag in not STATUS_P
                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flag) [] (VName -> SubExp
Var VName
warpscan) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int32
warp_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
                  TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TPrimExp Int8 VName
forall a. Num a => a
statusP) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                    Lambda GPUMem
lam' <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda GPUMem
scan_op1
                    KernelConstants
-> TExp Int64
-> VName
-> [VName]
-> Lambda GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
inBlockScanLookback
                      KernelConstants
constants
                      TExp Int64
num_virt_threads
                      VName
warpscan
                      [VName]
exchanges
                      Lambda GPUMem
lam'

                  -- all threads of the warp read the result of reduction
                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Int8 -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Int8
flag) [] (VName -> SubExp
Var VName
warpscan) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int32
warp_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
                  [(TV Any, VName)]
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [VName] -> [(TV Any, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [TV Any]
aggrs [VName]
exchanges) (((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(TV Any
aggr, VName
exchange) ->
                    VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
aggr) [] (VName -> SubExp
Var VName
exchange) [TExp Int32 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 TExp Int32
warp_size TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
1]
                  -- update read offset
                  TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
                    (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
forall a. Num a => a
statusP)
                    (TV Int32
readOffset TV Int32 -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- TExp Int32
loopStop)
                    ( TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TPrimExp Int8 VName
forall a. Num a => a
statusA) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                        TV Int32
readOffset TV Int32 -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- TV Int32 -> TExp Int32
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
readOffset TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
zExt32 TExp Int32
warp_size
                    )

                  -- update prefix if flag different than STATUS_X:
                  TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TV Int8 -> TPrimExp Int8 VName
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int8
flag TPrimExp Int8 VName -> TPrimExp Int8 VName -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.>. TPrimExp Int8 VName
forall a. Num a => a
statusX) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
                    Lambda GPUMem
lam <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda GPUMem
scan_op1
                    let ([VName]
xs, [VName]
ys) = Int -> [VName] -> ([VName], [VName])
forall a. Int -> [a] -> ([a], [a])
splitAt ([PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PrimType]
tys) ([VName] -> ([VName], [VName])) -> [VName] -> ([VName], [VName])
forall a b. (a -> b) -> a -> b
$ (Param (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName])
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
lam
                    [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
xs [TV Any]
aggrs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
x, TV Any
aggr) -> VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
x (TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
aggr)
                    [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
ys [TV Any]
prefixes) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
y, TV Any
prefix) -> VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
y (TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
prefix)
                    Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (Body GPUMem -> Stms GPUMem
forall rep. Body rep -> Stms rep
bodyStms (Body GPUMem -> Stms GPUMem) -> Body GPUMem -> Stms GPUMem
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
lam) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                      [(TV Any, PrimType, SubExp)]
-> ((TV Any, PrimType, SubExp)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [PrimType] -> [SubExp] -> [(TV Any, PrimType, SubExp)]
forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [TV Any]
prefixes [PrimType]
tys ([SubExp] -> [(TV Any, PrimType, SubExp)])
-> [SubExp] -> [(TV Any, PrimType, SubExp)]
forall a b. (a -> b) -> a -> b
$ (SubExpRes -> SubExp) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map SubExpRes -> SubExp
resSubExp ([SubExpRes] -> [SubExp]) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> a -> b
$ Body GPUMem -> [SubExpRes]
forall rep. Body rep -> [SubExpRes]
bodyResult (Body GPUMem -> [SubExpRes]) -> Body GPUMem -> [SubExpRes]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
lam) (((TV Any, PrimType, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, PrimType, SubExp)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                        \(TV Any
prefix, PrimType
ty, SubExp
res) -> TV Any
prefix TV Any -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- PrimExp VName -> TExp Any
forall {k} (t :: k) v. PrimExp v -> TPrimExp t v
TPrimExp (PrimType -> SubExp -> PrimExp VName
forall a. ToExp a => PrimType -> a -> PrimExp VName
toExp' PrimType
ty SubExp
res)
                  KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_fence
            )

          -- end sWhile
          -- end sIf
          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            Lambda GPUMem
scan_op2 <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda GPUMem
scan_op1
            let xs :: [VName]
xs = (Param (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName])
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> a -> b
$ Int
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. Int -> [a] -> [a]
take ([PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PrimType]
tys) ([Param (MemInfo SubExp NoUniqueness MemBind)]
 -> [Param (MemInfo SubExp NoUniqueness MemBind)])
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_op2
                ys :: [VName]
ys = (Param (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName])
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> a -> b
$ Int
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a. Int -> [a] -> [a]
drop ([PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PrimType]
tys) ([Param (MemInfo SubExp NoUniqueness MemBind)]
 -> [Param (MemInfo SubExp NoUniqueness MemBind)])
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
-> [Param (MemInfo SubExp NoUniqueness MemBind)]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_op2
            TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int32
boundary TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 (TExp Int64
tblock_size_e TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk)) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
              [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
xs [TV Any]
prefixes) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
x, TV Any
prefix) -> VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
x (TExp Any -> ImpM GPUMem KernelEnv KernelOp ())
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
prefix
              [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
ys [TV Any]
accs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
y, TV Any
acc) -> VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
y (TExp Any -> ImpM GPUMem KernelEnv KernelOp ())
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
acc
              Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (Body GPUMem -> Stms GPUMem
forall rep. Body rep -> Stms rep
bodyStms (Body GPUMem -> Stms GPUMem) -> Body GPUMem -> Stms GPUMem
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op2) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                  [(VName, SubExp)]
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [SubExp] -> [(VName, SubExp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
incprefixArrays ([SubExp] -> [(VName, SubExp)]) -> [SubExp] -> [(VName, SubExp)]
forall a b. (a -> b) -> a -> b
$ (SubExpRes -> SubExp) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map SubExpRes -> SubExp
resSubExp ([SubExpRes] -> [SubExp]) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> a -> b
$ Body GPUMem -> [SubExpRes]
forall rep. Body rep -> [SubExpRes]
bodyResult (Body GPUMem -> [SubExpRes]) -> Body GPUMem -> [SubExpRes]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op2) (((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                    \(VName
incprefixArray, SubExp
res) -> VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
incprefixArray [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] SubExp
res []
              KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
global_fence
              ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
statusFlags [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id] (IntType -> Integer -> SubExp
intConst IntType
Int8 Integer
forall a. Num a => a
statusP) []
            [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
exchanges [TV Any]
prefixes) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
exchange, TV Any
prefix) ->
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
exchange [TExp Int64
0] (TV Any -> SubExp
forall {k} (t :: k). TV t -> SubExp
tvSize TV Any
prefix) []
            [(TV Any, PrimType, SubExp)]
-> ((TV Any, PrimType, SubExp)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any] -> [PrimType] -> [SubExp] -> [(TV Any, PrimType, SubExp)]
forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [TV Any]
accs [PrimType]
tys [SubExp]
scanop_nes) (((TV Any, PrimType, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, PrimType, SubExp)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(TV Any
acc, PrimType
ty, SubExp
ne) ->
              TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
acc VName -> PrimExp VName -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimExp VName -> ImpM rep r op ()
<~~ PrimType -> SubExp -> PrimExp VName
forall a. ToExp a => PrimType -> a -> PrimExp VName
toExp' PrimType
ty SubExp
ne
        -- end sWhen
        -- end sWhen

        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TPrimExp Bool VName -> TPrimExp Bool VName
forall v. TPrimExp Bool v -> TPrimExp Bool v
bNot (TPrimExp Bool VName -> TPrimExp Bool VName)
-> TPrimExp Bool VName -> TPrimExp Bool VName
forall a b. (a -> b) -> a -> b
$ TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
dyn_id TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int64
0) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
          [(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
exchanges [TV Any]
prefixes) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
exchange, TV Any
prefix) ->
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
prefix) [] (VName -> SubExp
Var VName
exchange) [TExp Int64
0]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
      -- end sWhen
      -- end sComment

      Lambda GPUMem
scan_op3 <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda GPUMem
scan_op1
      Lambda GPUMem
scan_op4 <- Lambda GPUMem -> ImpM GPUMem KernelEnv KernelOp (Lambda GPUMem)
forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda GPUMem
scan_op1

      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Distribute results" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        let ([VName]
xs, [VName]
ys) = Int -> [VName] -> ([VName], [VName])
forall a. Int -> [a] -> ([a], [a])
splitAt ([PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PrimType]
tys) ([VName] -> ([VName], [VName])) -> [VName] -> ([VName], [VName])
forall a b. (a -> b) -> a -> b
$ (Param (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName])
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_op3
            ([VName]
xs', [VName]
ys') = Int -> [VName] -> ([VName], [VName])
forall a. Int -> [a] -> ([a], [a])
splitAt ([PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [PrimType]
tys) ([VName] -> ([VName], [VName])) -> [VName] -> ([VName], [VName])
forall a b. (a -> b) -> a -> b
$ (Param (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map Param (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. Param dec -> VName
paramName ([Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName])
-> [Param (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> [LParam GPUMem]
forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda GPUMem
scan_op4

        [(TV Any, TV Any, VName, VName, VName, VName, PrimType)]
-> ((TV Any, TV Any, VName, VName, VName, VName, PrimType)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([TV Any]
-> [TV Any]
-> [VName]
-> [VName]
-> [VName]
-> [VName]
-> [PrimType]
-> [(TV Any, TV Any, VName, VName, VName, VName, PrimType)]
forall a b c d e f g.
[a]
-> [b]
-> [c]
-> [d]
-> [e]
-> [f]
-> [g]
-> [(a, b, c, d, e, f, g)]
zip7 [TV Any]
prefixes [TV Any]
accs [VName]
xs [VName]
xs' [VName]
ys [VName]
ys' [PrimType]
tys) (((TV Any, TV Any, VName, VName, VName, VName, PrimType)
  -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((TV Any, TV Any, VName, VName, VName, VName, PrimType)
    -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
          \(TV Any
prefix, TV Any
acc, VName
x, VName
x', VName
y, VName
y', PrimType
ty) -> do
            VName -> PrimType -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimType -> ImpM rep r op ()
dPrim_ VName
x PrimType
ty
            VName -> PrimType -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimType -> ImpM rep r op ()
dPrim_ VName
y PrimType
ty
            VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
x' (TExp Any -> ImpM GPUMem KernelEnv KernelOp ())
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
prefix
            VName -> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op. VName -> TExp t -> ImpM rep r op ()
dPrimV_ VName
y' (TExp Any -> ImpM GPUMem KernelEnv KernelOp ())
-> TExp Any -> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ TV Any -> TExp Any
forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
acc

        TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName
-> ImpM rep r op () -> ImpM rep r op () -> ImpM rep r op ()
sIf
          (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
chunk32 TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int32
boundary TPrimExp Bool VName -> TPrimExp Bool VName -> TPrimExp Bool VName
forall v.
Eq v =>
TPrimExp Bool v -> TPrimExp Bool v -> TPrimExp Bool v
.&&. TPrimExp Bool VName -> TPrimExp Bool VName
forall v. TPrimExp Bool v -> TPrimExp Bool v
bNot TPrimExp Bool VName
blockNewSgm)
          ( Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (Body GPUMem -> Stms GPUMem
forall rep. Body rep -> Stms rep
bodyStms (Body GPUMem -> Stms GPUMem) -> Body GPUMem -> Stms GPUMem
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op4) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
              [(VName, PrimType, SubExp)]
-> ((VName, PrimType, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [PrimType] -> [SubExp] -> [(VName, PrimType, SubExp)]
forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [VName]
xs [PrimType]
tys ([SubExp] -> [(VName, PrimType, SubExp)])
-> [SubExp] -> [(VName, PrimType, SubExp)]
forall a b. (a -> b) -> a -> b
$ (SubExpRes -> SubExp) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map SubExpRes -> SubExp
resSubExp ([SubExpRes] -> [SubExp]) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> a -> b
$ Body GPUMem -> [SubExpRes]
forall rep. Body rep -> [SubExpRes]
bodyResult (Body GPUMem -> [SubExpRes]) -> Body GPUMem -> [SubExpRes]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op4) (((VName, PrimType, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, PrimType, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                \(VName
x, PrimType
ty, SubExp
res) -> VName
x VName -> PrimExp VName -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. VName -> PrimExp VName -> ImpM rep r op ()
<~~ PrimType -> SubExp -> PrimExp VName
forall a. ToExp a => PrimType -> a -> PrimExp VName
toExp' PrimType
ty SubExp
res
          )
          ([(VName, TV Any)]
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [TV Any] -> [(VName, TV Any)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
xs [TV Any]
accs) (((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, TV Any) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
x, TV Any
acc) -> VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
x [] (VName -> SubExp
Var (VName -> SubExp) -> VName -> SubExp
forall a b. (a -> b) -> a -> b
$ TV Any -> VName
forall {k} (t :: k). TV t -> VName
tvVar TV Any
acc) [])
        -- calculate where previous thread stopped, to determine number of
        -- elements left before new segment.
        TExp Int32
stop <-
          String -> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"stopping_point" (TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32))
-> TExp Int32 -> ImpM GPUMem KernelEnv KernelOp (TExp Int32)
forall a b. (a -> b) -> a -> b
$
            TExp Int32
segsize_compact TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- (TExp Int32
ltid32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
* TExp Int32
chunk32 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
1 TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
+ TExp Int32
segsize_compact TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
boundary) TExp Int32 -> TExp Int32 -> TExp Int32
forall e. IntegralExp e => e -> e -> e
`rem` TExp Int32
segsize_compact
        String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
          TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int64 -> TExp Int32
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 TExp Int64
i TExp Int32 -> TExp Int32 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int32
stop TExp Int32 -> TExp Int32 -> TExp Int32
forall a. Num a => a -> a -> a
- TExp Int32
1) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
            [(VName, VName)]
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [VName] -> [(VName, VName)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
private_chunks [VName]
ys) (((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
src, VName
y) ->
              -- only include prefix for the first segment part per thread
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
y [] (VName -> SubExp
Var VName
src) [TExp Int64
i]
            Names
-> Stms GPUMem
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
Names -> Stms rep -> ImpM rep r op () -> ImpM rep r op ()
compileStms Names
forall a. Monoid a => a
mempty (Body GPUMem -> Stms GPUMem
forall rep. Body rep -> Stms rep
bodyStms (Body GPUMem -> Stms GPUMem) -> Body GPUMem -> Stms GPUMem
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op3) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
              [(VName, SubExp)]
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [SubExp] -> [(VName, SubExp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
private_chunks ([SubExp] -> [(VName, SubExp)]) -> [SubExp] -> [(VName, SubExp)]
forall a b. (a -> b) -> a -> b
$ (SubExpRes -> SubExp) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> [a] -> [b]
map SubExpRes -> SubExp
resSubExp ([SubExpRes] -> [SubExp]) -> [SubExpRes] -> [SubExp]
forall a b. (a -> b) -> a -> b
$ Body GPUMem -> [SubExpRes]
forall rep. Body rep -> [SubExpRes]
bodyResult (Body GPUMem -> [SubExpRes]) -> Body GPUMem -> [SubExpRes]
forall a b. (a -> b) -> a -> b
$ Lambda GPUMem -> Body GPUMem
forall rep. Lambda rep -> Body rep
lambdaBody Lambda GPUMem
scan_op3) (((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, SubExp) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$
                \(VName
dest, SubExp
res) ->
                  VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
dest [TExp Int64
i] SubExp
res []

      Text
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"Transpose scan output and Write it to global memory in coalesced fashion" (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
        [(VName, VName, VName)]
-> ((VName, VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ ([VName] -> [VName] -> [VName] -> [(VName, VName, VName)]
forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [VName]
transposedArrays [VName]
private_chunks ([VName] -> [(VName, VName, VName)])
-> [VName] -> [(VName, VName, VName)]
forall a b. (a -> b) -> a -> b
$ (PatElem (MemInfo SubExp NoUniqueness MemBind) -> VName)
-> [PatElem (MemInfo SubExp NoUniqueness MemBind)] -> [VName]
forall a b. (a -> b) -> [a] -> [b]
map PatElem (MemInfo SubExp NoUniqueness MemBind) -> VName
forall dec. PatElem dec -> VName
patElemName [PatElem (MemInfo SubExp NoUniqueness MemBind)]
all_pes) (((VName, VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ((VName, VName, VName) -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \(VName
locmem, VName
priv, VName
dest) -> do
          -- sOp local_barrier
          String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
            TV Int64
sharedIdx <-
              String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TV t)
dPrimV String
"sharedIdx" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TV Int64)
forall a b. (a -> b) -> a -> b
$
                TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int64
ltid TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
chunk) TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
i
            VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
locmem [TV Int64 -> TExp Int64
forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
sharedIdx] (VName -> SubExp
Var VName
priv) [TExp Int64
i]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
          String
-> TExp Int64
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall {k} (t :: k) rep r op.
String
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor String
"i" TExp Int64
chunk ((TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
 -> ImpM GPUMem KernelEnv KernelOp ())
-> (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> do
            TExp Int64
flat_idx <- String -> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall {k} (t :: k) rep r op.
String -> TExp t -> ImpM rep r op (TExp t)
dPrimVE String
"flat_idx" (TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64))
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp (TExp Int64)
forall a b. (a -> b) -> a -> b
$ TExp Int64
thd_offset TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
+ TExp Int64
i TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
* TExp Int64
tblock_size_e
            [(VName, TExp Int64)]
-> TExp Int64 -> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
[(VName, TExp Int64)] -> TExp Int64 -> ImpM rep r op ()
dIndexSpace ([VName] -> [TExp Int64] -> [(VName, TExp Int64)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
gtids [TExp Int64]
dims') TExp Int64
flat_idx
            TPrimExp Bool VName
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
TPrimExp Bool VName -> ImpM rep r op () -> ImpM rep r op ()
sWhen (TExp Int64
flat_idx TExp Int64 -> TExp Int64 -> TPrimExp Bool VName
forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.<. TExp Int64
n) (ImpM GPUMem KernelEnv KernelOp ()
 -> ImpM GPUMem KernelEnv KernelOp ())
-> ImpM GPUMem KernelEnv KernelOp ()
-> ImpM GPUMem KernelEnv KernelOp ()
forall a b. (a -> b) -> a -> b
$ do
              VName
-> [TExp Int64]
-> SubExp
-> [TExp Int64]
-> ImpM GPUMem KernelEnv KernelOp ()
forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix
                VName
dest
                ((VName -> TExp Int64) -> [VName] -> [TExp Int64]
forall a b. (a -> b) -> [a] -> [b]
map VName -> TExp Int64
forall a. a -> TPrimExp Int64 a
Imp.le64 [VName]
gtids)
                (VName -> SubExp
Var VName
locmem)
                [TExp Int64 -> TExp Int64
forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int64 v
sExt64 (TExp Int64 -> TExp Int64) -> TExp Int64 -> TExp Int64
forall a b. (a -> b) -> a -> b
$ TExp Int64
flat_idx TExp Int64 -> TExp Int64 -> TExp Int64
forall a. Num a => a -> a -> a
- TExp Int64
block_offset]
          KernelOp -> ImpM GPUMem KernelEnv KernelOp ()
forall op rep r. op -> ImpM rep r op ()
sOp KernelOp
local_barrier
{-# NOINLINE compileSegScan #-}