{-# LANGUAGE TypeFamilies #-}

-- | We generate code for non-segmented/single-segment SegRed using
-- the basic approach outlined in the paper "Design and GPGPU
-- Performance of Futhark’s Redomap Construct" (ARRAY '16).  The main
-- deviations are:
--
-- * While we still use two-phase reduction, we use only a single
--   kernel, with the final threadblock to write a result (tracked via
--   an atomic counter) performing the final reduction as well.
--
-- * Instead of depending on storage layout transformations to handle
--   non-commutative reductions efficiently, we slide a
--   @tblocksize@-sized window over the input, and perform a parallel
--   reduction for each window.  This sacrifices the notion of
--   efficient sequentialisation, but is sometimes faster and
--   definitely simpler and more predictable (and uses less auxiliary
--   storage).
--
-- For segmented reductions we use the approach from "Strategies for
-- Regular Segmented Reductions on GPU" (FHPC '17).  This involves
-- having two different strategies, and dynamically deciding which one
-- to use based on the number of segments and segment size.  We use
-- the (static) @tblock_size@ to decide which of the following two
-- strategies to choose:
--
-- * Large: uses one or more blocks to process a single segment.  If
--   multiple blocks are used per segment, the intermediate reduction
--   results must be recursively reduced, until there is only a single
--   value per segment.
--
--   Each thread /can/ read multiple elements, which will greatly
--   increase performance; however, if the reduction is
--   non-commutative we will have to use a less efficient traversal
--   (with interim block-wide reductions) to enable coalesced memory
--   accesses, just as in the non-segmented case.
--
-- * Small: is used to let each block process *multiple* segments.  We
--   will only use this approach when we can process at least two
--   segments within a single block.  In those cases, we would
--   allocate a /whole/ block per segment with the large strategy, but
--   at most 50% of the threads in the block would have any element to
--   read, which becomes highly inefficient.
--
-- An optimization specifically targeted at non-segmented and
-- large-segments segmented reductions with non-commutative operators:
-- the stage one main loop is essentially stripmined by a factor
-- *chunk*, inserting collective copies via local memory of each
-- reduction parameter going into the intra-block (partial)
-- reductions.  This saves a factor *chunk* number of intra-block
-- reductions at the cost of some overhead in collective copies.
module Futhark.CodeGen.ImpGen.GPU.SegRed
  ( compileSegRed,
    compileSegRed',
    DoSegBody,
  )
where

import Control.Monad
import Data.List (genericLength, zip4)
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.CodeGen.ImpGen.GPU.Base
import Futhark.Error
import Futhark.IR.GPUMem
import Futhark.IR.Mem.LMAD qualified as LMAD
import Futhark.Transform.Rename
import Futhark.Util (chunks, mapAccumLM)
import Futhark.Util.IntegralExp (divUp, nextMul, quot, rem)
import Prelude hiding (quot, rem)

forM2_ :: (Monad m) => [a] -> [b] -> (a -> b -> m c) -> m ()
forM2_ xs ys f = forM_ (zip xs ys) (uncurry f)

-- | The maximum number of operators we support in a single SegRed.
-- This limit arises out of the static allocation of counters.
maxNumOps :: Int
maxNumOps = 20

-- | Code generation for the body of the SegRed, taking a continuation
-- for saving the results of the body.  The results should be
-- represented as a pairing of a t'SubExp' along with a list of
-- indexes into that t'SubExp' for reading the result.
type DoSegBody = ([(SubExp, [Imp.TExp Int64])] -> InKernelGen ()) -> InKernelGen ()

-- | Datatype used to distinguish between and work with the different sets of
-- intermediate memory we need for the different ReduceKinds.
data SegRedIntermediateArrays
  = GeneralSegRedInterms
      { blockRedArrs :: [VName]
      }
  | NoncommPrimSegRedInterms
      { collCopyArrs :: [VName],
        blockRedArrs :: [VName],
        privateChunks :: [VName]
      }

-- | Compile 'SegRed' instance to host-level code with calls to
-- various kernels.
compileSegRed ::
  Pat LetDecMem ->
  SegLevel ->
  SegSpace ->
  [SegBinOp GPUMem] ->
  KernelBody GPUMem ->
  CallKernelGen ()
compileSegRed pat lvl space segbinops map_kbody = do
  emit $ Imp.DebugPrint "\n# SegRed" Nothing
  KernelAttrs {kAttrNumBlocks = num_tblocks, kAttrBlockSize = tblock_size} <-
    lvlKernelAttrs lvl
  let grid = KernelGrid num_tblocks tblock_size

  compileSegRed' pat grid space segbinops $ \red_cont ->
    sComment "apply map function" $
      compileStms mempty (kernelBodyStms map_kbody) $ do
        let (red_res, map_res) =
              splitAt (segBinOpResults segbinops) $ kernelBodyResult map_kbody

        let mapout_arrs = drop (segBinOpResults segbinops) $ patElems pat
        unless (null mapout_arrs) $
          sComment "write map-out result(s)" $ do
            zipWithM_ (compileThreadResult space) mapout_arrs map_res

        red_cont $ map ((,[]) . kernelResultSubExp) red_res
  emit $ Imp.DebugPrint "" Nothing

paramOf :: SegBinOp GPUMem -> [Param LParamMem]
paramOf (SegBinOp _ op ne _) = take (length ne) $ lambdaParams op

isPrimSegBinOp :: SegBinOp GPUMem -> Bool
isPrimSegBinOp segbinop =
  all primType (lambdaReturnType $ segBinOpLambda segbinop)
    && shapeRank (segBinOpShape segbinop) == 0

-- | Like 'compileSegRed', but where the body is a monadic action.
compileSegRed' ::
  Pat LetDecMem ->
  KernelGrid ->
  SegSpace ->
  [SegBinOp GPUMem] ->
  DoSegBody ->
  CallKernelGen ()
compileSegRed' pat grid space segbinops map_body_cont
  | genericLength segbinops > maxNumOps =
      compilerLimitationS $
        ("compileSegRed': at most " <> show maxNumOps <> " reduction operators are supported.\n")
          <> ("Pattern: " <> prettyString pat)
  | otherwise = do
      chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
      case unSegSpace space of
        [(_, Constant (IntValue (Int64Value 1))), _] ->
          compileReduction (chunk_v, chunk_const) nonsegmentedReduction
        _ -> do
          let segment_size = pe64 $ last $ segSpaceDims space
              use_small_segments = segment_size * 2 .<. pe64 (unCount tblock_size) * tvExp chunk_v
          sIf
            use_small_segments
            (compileReduction (chunk_v, chunk_const) smallSegmentsReduction)
            (compileReduction (chunk_v, chunk_const) largeSegmentsReduction)
  where
    compileReduction chunk f =
      f pat num_tblocks tblock_size chunk space segbinops map_body_cont

    param_types = map paramType $ concatMap paramOf segbinops

    num_tblocks = gridNumBlocks grid
    tblock_size = gridBlockSize grid

    chunk_const =
      if Noncommutative `elem` map segBinOpComm segbinops
        && all isPrimSegBinOp segbinops
        then getChunkSize param_types
        else Imp.ValueExp $ IntValue $ intValue Int64 (1 :: Int64)
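
-- As an illustration of the dispatch above (the numbers are made up,
-- not part of the generated code): with a static @tblock_size@ of 256
-- and a chunk of 1, segments of up to 127 elements satisfy
-- @segment_size * 2 < tblock_size * chunk@ and are handled by the
-- small-segments kernel, while segments of 128 elements or more go to
-- the large-segments kernel.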

-- | Prepare intermediate arrays for the reduction.  Prim-typed
-- arguments go in local memory (so we need to do the allocation of
-- those arrays inside the kernel), while array-typed arguments go in
-- global memory.  Allocations for the latter have already been
-- performed.  This policy is baked into how the allocations are done
-- in ExplicitAllocations.
--
-- For more info about the intermediate arrays used for the different reduction
-- kernels, see note [IntermArrays].
makeIntermArrays ::
  Imp.TExp Int64 ->
  SubExp ->
  SubExp ->
  [SegBinOp GPUMem] ->
  InKernelGen [SegRedIntermediateArrays]
makeIntermArrays tblock_id tblock_size chunk segbinops
  | Noncommutative <- mconcat (map segBinOpComm segbinops),
    all isPrimSegBinOp segbinops =
      noncommPrimSegRedInterms
  | otherwise =
      generalSegRedInterms tblock_id tblock_size segbinops
  where
    params = map paramOf segbinops

    noncommPrimSegRedInterms = do
      block_worksize <- tvSize <$> dPrimV "block_worksize" block_worksize_E

      -- compute total amount of lmem.
      let sum_ x y = nextMul x y + tblock_size_E * y
          block_reds_lmem_requirement = foldl sum_ 0 $ concat elem_sizes
          collcopy_lmem_requirement = block_worksize_E * max_elem_size
          lmem_total_size =
            Imp.bytes $
              collcopy_lmem_requirement `sMax64` block_reds_lmem_requirement

      -- offsets into the total pool of lmem for each block reduction array.
      (_, offsets) <-
        forAccumLM2D 0 elem_sizes $ \byte_offs elem_size ->
          (,byte_offs `quot` elem_size)
            <$> dPrimVE "offset" (sum_ byte_offs elem_size)

      -- total pool of local mem.
      lmem <- sAlloc "local_mem" lmem_total_size (Space "shared")
      let arrInLMem ptype name len_se offset =
            sArray
              (name ++ "_" ++ prettyString ptype)
              ptype
              (Shape [len_se])
              lmem
              $ LMAD.iota offset [pe64 len_se]

      forM (zipWith zip params offsets) $ \ps_and_offsets -> do
        (coll_copy_arrs, block_red_arrs, priv_chunks) <-
          fmap unzip3 $ forM ps_and_offsets $ \(p, offset) -> do
            let ptype = elemType $ paramType p
            (,,)
              <$> arrInLMem ptype "coll_copy_arr" block_worksize 0
              <*> arrInLMem ptype "block_red_arr" tblock_size offset
              <*> sAllocArray
                ("chunk_" ++ prettyString ptype)
                ptype
                (Shape [chunk])
                (ScalarSpace [chunk] ptype)
        pure $ NoncommPrimSegRedInterms coll_copy_arrs block_red_arrs priv_chunks

    tblock_size_E = pe64 tblock_size
    block_worksize_E = tblock_size_E * pe64 chunk

    paramSize = primByteSize . elemType . paramType
    elem_sizes = map (map paramSize) params
    max_elem_size = maximum $ concat elem_sizes

    forAccumLM2D acc ls f = mapAccumLM (mapAccumLM f) acc ls

generalSegRedInterms ::
  Imp.TExp Int64 ->
  SubExp ->
  [SegBinOp GPUMem] ->
  InKernelGen [SegRedIntermediateArrays]
generalSegRedInterms tblock_id tblock_size segbinops =
  fmap (map GeneralSegRedInterms) $
    forM (map paramOf segbinops) $
      mapM $ \p ->
        case paramDec p of
          MemArray pt shape _ (ArrayIn mem _) -> do
            let shape' = Shape [tblock_size] <> shape
            let shape_E = map pe64 $ shapeDims shape'
            sArray ("red_arr_" ++ prettyString pt) pt shape' mem $
              LMAD.iota (tblock_id * product shape_E) shape_E
          _ -> do
            let pt = elemType $ paramType p
                shape = Shape [tblock_size]
            sAllocArray ("red_arr_" ++ prettyString pt) pt shape $ Space "shared"
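
-- A rough worked example of the local-memory layout computed above
-- (illustrative, with made-up sizes): for @tblock_size@ = 256 and two
-- fused operators with primitive element sizes of 4 and 8 bytes, the
-- block-reduction requirement is @nextMul 0 4 + 256*4 = 1024@ bytes
-- after the first array and @nextMul 1024 8 + 256*8 = 3072@ bytes in
-- total, so the two block-reduction arrays live at byte offsets 0 and
-- 1024 of the shared pool.  The collective-copy requirement is
-- @256 * chunk * 8@ bytes, and the pool is sized as the maximum of the
-- two.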

-- | Arrays for storing block results.
--
-- The block-result arrays have an extra dimension because they are
-- also used for keeping vectorised accumulators for first-stage
-- reduction, if necessary.  If so, this dimension has size
-- tblock_size, and otherwise 1.  When actually storing block results,
-- the first index is set to 0.
groupResultArrays ::
  SubExp ->
  SubExp ->
  [SegBinOp GPUMem] ->
  CallKernelGen [[VName]]
groupResultArrays num_virtblocks tblock_size segbinops =
  forM segbinops $ \(SegBinOp _ lam _ shape) ->
    forM (lambdaReturnType lam) $ \t -> do
      let pt = elemType t
          extra_dim
            | primType t, shapeRank shape == 0 = intConst Int64 1
            | otherwise = tblock_size
          full_shape = Shape [extra_dim, num_virtblocks] <> shape <> arrayShape t
          -- Move the tblocksize dimension last to ensure coalesced
          -- memory access.
          perm = [1 .. shapeRank full_shape - 1] ++ [0]
      sAllocArrayPerm "segred_tmp" pt full_shape (Space "device") perm

type DoCompileSegRed =
  Pat LetDecMem ->
  Count NumBlocks SubExp ->
  Count BlockSize SubExp ->
  (TV Int64, Imp.KernelConstExp) ->
  SegSpace ->
  [SegBinOp GPUMem] ->
  DoSegBody ->
  CallKernelGen ()

nonsegmentedReduction :: DoCompileSegRed
nonsegmentedReduction (Pat segred_pes) num_tblocks tblock_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
      chunk = tvExp chunk_v
      num_tblocks_se = unCount num_tblocks
      tblock_size_se = unCount tblock_size
      tblock_size' = pe64 tblock_size_se
      global_tid = Imp.le64 $ segFlat space
      n = pe64 $ last dims

  counters <- genZeroes "counters" maxNumOps

  reds_block_res_arrs <- groupResultArrays num_tblocks_se tblock_size_se segbinops

  num_threads <-
    fmap tvSize $ dPrimV "num_threads" $ pe64 num_tblocks_se * tblock_size'

  let attrs =
        (defKernelAttrs num_tblocks tblock_size)
          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
          }

  sKernelThread "segred_nonseg" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let ltid = kernelLocalThreadId constants
    let tblock_id = kernelBlockId constants

    interms <- makeIntermArrays (sExt64 tblock_id) tblock_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "shared"

    -- Since this is the nonsegmented case, all outer segment IDs must
    -- necessarily be 0.
    forM_ gtids $ \v -> dPrimV_ v (0 :: Imp.TExp Int64)

    q <- dPrimVE "q" $ n `divUp` (sExt64 (kernelNumThreads constants) * chunk)

    slugs <-
      mapM (segBinOpSlug ltid tblock_id) $
        zip3 segbinops interms reds_block_res_arrs
    new_lambdas <-
      reductionStageOne
        gtids
        n
        global_tid
        q
        chunk
        (pe64 num_threads)
        slugs
        map_body_cont

    let segred_pess =
          chunks (map (length . segBinOpNeutral) segbinops) segred_pes

    forM_ (zip4 segred_pess slugs new_lambdas [0 ..]) $
      \(pes, slug, new_lambda, i) ->
        reductionStageTwo
          pes
          tblock_id
          [0]
          0
          (sExt64 $ kernelNumBlocks constants)
          slug
          new_lambda
          counters
          sync_arr
          (fromInteger i)
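
-- To give a feel for the loop trip count @q@ computed in the kernel
-- above (an illustration with made-up sizes): reducing n = 1000000
-- elements with 128 threadblocks of 256 threads gives 32768 threads in
-- total, so with a chunk of 4 each thread makes
-- q = 1000000 `divUp` (32768 * 4) = 8 trips through the stage-one loop.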

smallSegmentsReduction :: DoCompileSegRed
smallSegmentsReduction (Pat segred_pes) num_tblocks tblock_size _ space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
      dims' = map pe64 dims
      segment_size = last dims'

  -- Careful to avoid division by zero now.
  segment_size_nonzero <- dPrimVE "segment_size_nonzero" $ sMax64 1 segment_size

  let tblock_size_se = unCount tblock_size
      num_tblocks_se = unCount num_tblocks
      num_tblocks' = pe64 num_tblocks_se
      tblock_size' = pe64 tblock_size_se
  num_threads <- fmap tvSize $ dPrimV "num_threads" $ num_tblocks' * tblock_size'
  let num_segments = product $ init dims'
      segments_per_block = tblock_size' `quot` segment_size_nonzero
      required_blocks = sExt32 $ num_segments `divUp` segments_per_block

  emit $ Imp.DebugPrint "# SegRed-small" Nothing
  emit $ Imp.DebugPrint "num_segments" $ Just $ untyped num_segments
  emit $ Imp.DebugPrint "segment_size" $ Just $ untyped segment_size
  emit $ Imp.DebugPrint "segments_per_block" $ Just $ untyped segments_per_block
  emit $ Imp.DebugPrint "required_blocks" $ Just $ untyped required_blocks

  sKernelThread "segred_small" (segFlat space) (defKernelAttrs num_tblocks tblock_size) $ do
    constants <- kernelConstants <$> askEnv
    let tblock_id = sExt64 $ kernelBlockId constants
        ltid = sExt64 $ kernelLocalThreadId constants

    interms <- generalSegRedInterms tblock_id tblock_size_se segbinops
    let reds_arrs = map blockRedArrs interms

    -- We probably do not have enough actual threadblocks to cover the
    -- entire iteration space.  Some blocks thus have to perform double
    -- duty; we put an outer loop to accomplish this.
    virtualiseBlocks SegVirt required_blocks $ \virttblock_id -> do
      -- Compute the 'n' input indices.  The outer 'n-1' correspond to
      -- the segment ID, and are computed from the block id.  The inner
      -- is computed from the local thread id, and may be out-of-bounds.
      let segment_index =
            (ltid `quot` segment_size_nonzero)
              + (sExt64 virttblock_id * sExt64 segments_per_block)
          index_within_segment = ltid `rem` segment_size

      dIndexSpace (zip (init gtids) (init dims')) segment_index
      dPrimV_ (last gtids) index_within_segment

      let in_bounds =
            map_body_cont $ \red_res ->
              sComment "save results to be reduced" $ do
                let red_dests = map (,[ltid]) (concat reds_arrs)
                forM2_ red_dests red_res $ \(d, d_is) (res, res_is) ->
                  copyDWIMFix d d_is res res_is
          out_of_bounds =
            forM2_ segbinops reds_arrs $ \(SegBinOp _ _ nes _) red_arrs ->
              forM2_ red_arrs nes $ \arr ne ->
                copyDWIMFix arr [ltid] ne []

      sComment "apply map function if in bounds" $
        sIf
          ( segment_size .>. 0
              .&&. isActive (init $ zip gtids dims)
              .&&. ltid .<. segment_size * segments_per_block
          )
          in_bounds
          out_of_bounds

      sOp $ Imp.ErrorSync Imp.FenceLocal -- Also implicitly barrier.
      let crossesSegment from to =
            (sExt64 to - sExt64 from) .>. (sExt64 to `rem` segment_size)
      sWhen (segment_size .>. 0) $
        sComment "perform segmented scan to imitate reduction" $
          forM2_ segbinops reds_arrs $ \(SegBinOp _ red_op _ _) red_arrs ->
            blockScan
              (Just crossesSegment)
              (sExt64 $ pe64 num_threads)
              (segment_size * segments_per_block)
              red_op
              red_arrs

      sOp $ Imp.Barrier Imp.FenceLocal

      sComment "save final values of segments" $
        sWhen
          ( sExt64 virttblock_id * segments_per_block + sExt64 ltid .<. num_segments
              .&&. ltid .<. segments_per_block
          )
          $ forM2_ segred_pes (concat reds_arrs)
          $ \pe arr -> do
            -- Figure out which segment result this thread should write...
            let flat_segment_index =
                  sExt64 virttblock_id * segments_per_block + sExt64 ltid
                gtids' = unflattenIndex (init dims') flat_segment_index
            copyDWIMFix
              (patElemName pe)
              gtids'
              (Var arr)
              [(ltid + 1) * segment_size_nonzero - 1]

      -- Finally another barrier, because we will be writing to the
      -- local memory array first thing in the next iteration.
      sOp $ Imp.Barrier Imp.FenceLocal
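
-- A worked example of the small-segments strategy (illustrative
-- numbers only): with @tblock_size@ = 256 and segments of 10 elements,
-- each block handles @segments_per_block@ = 25 segments, the first 250
-- threads of the block each read one element, and the remaining 6
-- threads fall into the out-of-bounds branch above and just write
-- neutral elements.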

largeSegmentsReduction :: DoCompileSegRed
largeSegmentsReduction (Pat segred_pes) num_tblocks tblock_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
      dims' = map pe64 dims
      num_segments = product $ init dims'
      segment_size = last dims'
      num_tblocks' = pe64 $ unCount num_tblocks
      tblock_size_se = unCount tblock_size
      tblock_size' = pe64 tblock_size_se
      chunk = tvExp chunk_v

  blocks_per_segment <-
    dPrimVE "blocks_per_segment" $
      num_tblocks' `divUp` sMax64 1 num_segments
  q <-
    dPrimVE "q" $
      segment_size `divUp` (tblock_size' * blocks_per_segment * chunk)

  num_virtblocks <-
    dPrimV "num_virtblocks" $
      blocks_per_segment * num_segments
  threads_per_segment <-
    dPrimVE "threads_per_segment" $
      blocks_per_segment * tblock_size'

  emit $ Imp.DebugPrint "# SegRed-large" Nothing
  emit $ Imp.DebugPrint "num_segments" $ Just $ untyped num_segments
  emit $ Imp.DebugPrint "segment_size" $ Just $ untyped segment_size
  emit $ Imp.DebugPrint "num_virtblocks" $ Just $ untyped $ tvExp num_virtblocks
  emit $ Imp.DebugPrint "num_tblocks" $ Just $ untyped num_tblocks'
  emit $ Imp.DebugPrint "tblock_size" $ Just $ untyped tblock_size'
  emit $ Imp.DebugPrint "q" $ Just $ untyped q
  emit $ Imp.DebugPrint "blocks_per_segment" $ Just $ untyped blocks_per_segment

  reds_block_res_arrs <- groupResultArrays (tvSize num_virtblocks) tblock_size_se segbinops

  -- In principle we should have a counter for every segment.  Since
  -- the number of segments is a dynamic quantity, we would have to
  -- allocate and zero out an array here, which is expensive.
  -- However, we exploit the fact that the number of segments being
  -- reduced at any point in time is limited by the number of
  -- threadblocks.  If we bound the number of threadblocks, we can get
  -- away with using that many counters.  FIXME: Is this limit checked
  -- anywhere?  There are other places in the compiler that will fail
  -- if the block count exceeds the maximum block size, which is at
  -- most 1024 anyway.
  let num_counters = maxNumOps * 1024
  counters <- genZeroes "counters" $ fromIntegral num_counters

  let attrs =
        (defKernelAttrs num_tblocks tblock_size)
          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
          }

  sKernelThread "segred_large" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let tblock_id = sExt64 $ kernelBlockId constants
        ltid = kernelLocalThreadId constants

    interms <- makeIntermArrays tblock_id tblock_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "shared"

    -- We probably do not have enough actual threadblocks to cover the
    -- entire iteration space.  Some blocks thus have to perform double
    -- duty; we put an outer loop to accomplish this.
    virtualiseBlocks SegVirt (sExt32 (tvExp num_virtblocks)) $ \virttblock_id -> do
      let segment_gtids = init gtids

      flat_segment_id <-
        dPrimVE "flat_segment_id" $
          sExt64 virttblock_id `quot` blocks_per_segment

      global_tid <-
        dPrimVE "global_tid" $
          (sExt64 virttblock_id * sExt64 tblock_size' + sExt64 ltid)
            `rem` threads_per_segment

      let first_block_for_segment = flat_segment_id * blocks_per_segment
      dIndexSpace (zip segment_gtids (init dims')) flat_segment_id
      dPrim_ (last gtids) int64
      let n = pe64 $ last dims

      slugs <-
        mapM (segBinOpSlug ltid virttblock_id) $
          zip3 segbinops interms reds_block_res_arrs
      new_lambdas <-
        reductionStageOne
          gtids
          n
          global_tid
          q
          chunk
          threads_per_segment
          slugs
          map_body_cont

      let segred_pess =
            chunks (map (length . segBinOpNeutral) segbinops) segred_pes

          multiple_blocks_per_segment =
            forM_ (zip4 segred_pess slugs new_lambdas [0 ..]) $
              \(pes, slug, new_lambda, i) -> do
                let counter_idx =
                      fromIntegral (i * num_counters)
                        + flat_segment_id `rem` fromIntegral num_counters
                reductionStageTwo
                  pes
                  virttblock_id
                  (map Imp.le64 segment_gtids)
                  first_block_for_segment
                  blocks_per_segment
                  slug
                  new_lambda
                  counters
                  sync_arr
                  counter_idx

          one_block_per_segment =
            sComment "first thread in block saves final result to memory" $
              forM2_ slugs segred_pess $ \slug pes ->
                sWhen (ltid .==. 0) $
                  forM2_ pes (slugAccs slug) $ \v (acc, acc_is) ->
                    copyDWIMFix (patElemName v) (map Imp.le64 segment_gtids) (Var acc) acc_is

      sIf (blocks_per_segment .==. 1) one_block_per_segment multiple_blocks_per_segment
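
-- A worked example of the large-segments strategy (illustrative
-- numbers only): reducing 8 segments of 2^20 elements each with 128
-- threadblocks of 256 threads and a chunk of 4 gives
-- @blocks_per_segment@ = 16, @threads_per_segment@ = 4096 and
-- q = 2^20 `divUp` (256 * 16 * 4) = 64, after which the 16 per-block
-- partial results of each segment are combined in stage two.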

-- | Auxiliary information for a single reduction.  A slug holds the `SegBinOp`
-- operator for a single reduction, the different arrays required throughout
-- stages one and two, and a global mem destination for the final result of the
-- particular reduction.
data SegBinOpSlug = SegBinOpSlug
  { slugOp :: SegBinOp GPUMem,
    -- | Intermediate arrays needed for the given reduction.
    slugInterms :: SegRedIntermediateArrays,
    -- | Place(s) to store block accumulator(s) in stage 1 reduction.
    slugAccs :: [(VName, [Imp.TExp Int64])],
    -- | Global memory destination(s) for the final result(s) for this
    -- particular reduction.
    blockResArrs :: [VName]
  }

segBinOpSlug ::
  Imp.TExp Int32 ->
  Imp.TExp Int32 ->
  (SegBinOp GPUMem, SegRedIntermediateArrays, [VName]) ->
  InKernelGen SegBinOpSlug
segBinOpSlug ltid tblock_id (op, interms, block_res_arrs) = do
  accs <- zipWithM mkAcc (lambdaParams (segBinOpLambda op)) block_res_arrs
  pure $ SegBinOpSlug op interms accs block_res_arrs
  where
    mkAcc p block_res_arr
      | Prim t <- paramType p,
        shapeRank (segBinOpShape op) == 0 = do
          block_res_acc <- dPrim (baseString (paramName p) <> "_block_res_acc") t
          pure (tvVar block_res_acc, [])
      -- if this is a non-primitive reduction, the global mem result array will
      -- double as accumulator.
      | otherwise = pure (block_res_arr, [sExt64 ltid, sExt64 tblock_id])

slugLambda :: SegBinOpSlug -> Lambda GPUMem
slugLambda = segBinOpLambda . slugOp

slugBody :: SegBinOpSlug -> Body GPUMem
slugBody = lambdaBody . slugLambda

slugParams :: SegBinOpSlug -> [LParam GPUMem]
slugParams = lambdaParams . slugLambda

slugNeutral :: SegBinOpSlug -> [SubExp]
slugNeutral = segBinOpNeutral . slugOp

slugShape :: SegBinOpSlug -> Shape
slugShape = segBinOpShape . slugOp

slugsComm :: [SegBinOpSlug] -> Commutativity
slugsComm = mconcat . map (segBinOpComm . slugOp)

slugSplitParams :: SegBinOpSlug -> ([LParam GPUMem], [LParam GPUMem])
slugSplitParams slug = splitAt (length (slugNeutral slug)) $ slugParams slug

slugBlockRedArrs :: SegBinOpSlug -> [VName]
slugBlockRedArrs = blockRedArrs . slugInterms

slugPrivChunks :: SegBinOpSlug -> [VName]
slugPrivChunks = privateChunks . slugInterms

slugCollCopyArrs :: SegBinOpSlug -> [VName]
slugCollCopyArrs = collCopyArrs . slugInterms

reductionStageOne ::
  [VName] ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  [SegBinOpSlug] ->
  DoSegBody ->
  InKernelGen [Lambda GPUMem]
reductionStageOne gtids n global_tid q chunk threads_per_segment slugs body_cont = do
  constants <- kernelConstants <$> askEnv
  let glb_ind_var = mkTV (last gtids) int64
      ltid = sExt64 $ kernelLocalThreadId constants

  dScope Nothing $ scopeOfLParams $ concatMap slugParams slugs

  sComment "ne-initialise the outer (per-block) accumulator(s)" $ do
    forM_ slugs $ \slug ->
      forM2_ (slugAccs slug) (slugNeutral slug) $ \(acc, acc_is) ne ->
        sLoopNest (slugShape slug) $ \vec_is ->
          copyDWIMFix acc (acc_is ++ vec_is) ne []

  new_lambdas <- mapM (renameLambda . slugLambda) slugs
  let tblock_size = sExt32 $ kernelBlockSize constants
  let doBlockReduce =
        forM2_ slugs new_lambdas $ \slug new_lambda -> do
          let accs = slugAccs slug
          let params = slugParams slug
          sLoopNest (slugShape slug) $ \vec_is -> do
            let block_red_arrs = slugBlockRedArrs slug
            sComment "store accs. prims go in lmem; non-prims in params (in global mem)" $
              forM_ (zip3 block_red_arrs accs params) $
                \(arr, (acc, acc_is), p) ->
                  if isPrimParam p
                    then copyDWIMFix arr [ltid] (Var acc) (acc_is ++ vec_is)
                    else copyDWIMFix (paramName p) [] (Var acc) (acc_is ++ vec_is)

            sOp $ Imp.ErrorSync Imp.FenceLocal -- Also implicitly barrier.
            blockReduce tblock_size new_lambda block_red_arrs
            sOp $ Imp.Barrier Imp.FenceLocal

            sComment "thread 0 updates per-block acc(s); rest reset to ne" $ do
              sIf
                (ltid .==. 0)
                ( forM2_ accs (lambdaParams new_lambda) $ \(acc, acc_is) p ->
                    copyDWIMFix acc (acc_is ++ vec_is) (Var $ paramName p) []
                )
                ( forM2_ accs (slugNeutral slug) $ \(acc, acc_is) ne ->
                    copyDWIMFix acc (acc_is ++ vec_is) ne []
                )

  case (slugsComm slugs, all (isPrimSegBinOp . slugOp) slugs) of
    (Noncommutative, True) ->
      noncommPrimParamsStageOneBody slugs body_cont glb_ind_var global_tid q n chunk doBlockReduce
    _ ->
      generalStageOneBody slugs body_cont glb_ind_var global_tid q n threads_per_segment doBlockReduce

  pure new_lambdas
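
-- Within 'generalStageOneBody' below, the commutative and
-- non-commutative cases differ mainly in how a thread's global index
-- is derived in each of the @q@ iterations.  Roughly (a sketch of the
-- generated control flow, not the emitted code itself):
--
-- > -- commutative (strided traversal):
-- > for i < q: idx = global_tid + i * threads_per_segment; acc = op(acc, x[idx])
-- > intra-block reduction of acc once, after the loop
-- >
-- > -- non-commutative (sliding window):
-- > for i < q: idx = block_offset(i) + ltid; acc = op(acc, x[idx])
-- >            intra-block reduction of acc in every iteration
--
-- so the commutative traversal is strided, while the non-commutative
-- one reduces one contiguous tblock-sized window per iteration.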

generalStageOneBody ::
  [SegBinOpSlug] ->
  DoSegBody ->
  TV Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  InKernelGen () ->
  InKernelGen ()
generalStageOneBody slugs body_cont glb_ind_var global_tid q n threads_per_segment doBlockReduce = do
  let is_comm = slugsComm slugs == Commutative

  constants <- kernelConstants <$> askEnv
  let tblock_size = kernelBlockSize constants
      ltid = sExt64 $ kernelLocalThreadId constants

  -- this block's id within its designated segment, and this block's initial
  -- global offset.
  tblock_id_in_segment <- dPrimVE "tblock_id_in_segment" $ global_tid `quot` tblock_size
  block_base_offset <- dPrimVE "block_base_offset" $ tblock_id_in_segment * q * tblock_size

  sFor "i" q $ \i -> do
    block_offset <- dPrimVE "block_offset" $ block_base_offset + i * tblock_size
    glb_ind_var
      <-- if is_comm
        then global_tid + threads_per_segment * i
        else block_offset + ltid
    sWhen (tvExp glb_ind_var .<. n) $
      sComment "apply map function(s)" $
        body_cont $ \all_red_res -> do
          let maps_res = chunks (map (length . slugNeutral) slugs) all_red_res

          forM2_ slugs maps_res $ \slug map_res ->
            sLoopNest (slugShape slug) $ \vec_is -> do
              let (acc_params, next_params) = slugSplitParams slug
              sComment "load accumulator(s)" $
                forM2_ acc_params (slugAccs slug) $ \p (acc, acc_is) ->
                  copyDWIMFix (paramName p) [] (Var acc) (acc_is ++ vec_is)
              sComment "load next value(s)" $
                forM2_ next_params map_res $ \p (res, res_is) ->
                  copyDWIMFix (paramName p) [] res (res_is ++ vec_is)
              sComment "apply reduction operator(s)" $
                compileStms mempty (bodyStms $ slugBody slug) $
                  sComment "store in accumulator(s)" $
                    forM2_
                      (slugAccs slug)
                      (map resSubExp $ bodyResult $ slugBody slug)
                      $ \(acc, acc_is) se ->
                        copyDWIMFix acc (acc_is ++ vec_is) se []

    unless is_comm doBlockReduce

  sOp $ Imp.ErrorSync Imp.FenceLocal
  when is_comm doBlockReduce

noncommPrimParamsStageOneBody ::
  [SegBinOpSlug] ->
  DoSegBody ->
  TV Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  InKernelGen () ->
  InKernelGen ()
noncommPrimParamsStageOneBody slugs body_cont glb_ind_var global_tid q n chunk doLMemBlockReduce = do
  constants <- kernelConstants <$> askEnv
  let tblock_size = kernelBlockSize constants
      ltid = sExt64 $ kernelLocalThreadId constants

  -- this block's id within its designated segment; the stride made per block in
  -- the outer `i < q` loop; and this block's initial global offset.
  tblock_id_in_segment <- dPrimVE "block_offset_in_segment" $ global_tid `quot` tblock_size
  block_stride <- dPrimVE "block_stride" $ tblock_size * chunk
  block_base_offset <- dPrimVE "block_base_offset" $ tblock_id_in_segment * q * block_stride

  let chunkLoop = sFor "k" chunk

  sFor "i" q $ \i -> do
    -- block offset in this iteration.
    block_offset <- dPrimVE "block_offset" $ block_base_offset + i * block_stride
    chunkLoop $ \k -> do
      loc_ind <- dPrimVE "loc_ind" $ k * tblock_size + ltid
      glb_ind_var <-- block_offset + loc_ind
      sIf
        (tvExp glb_ind_var .<. n)
        ( body_cont $ \all_red_res -> do
            let slugs_res = chunks (map (length . slugNeutral) slugs) all_red_res
            forM2_ slugs slugs_res $ \slug slug_res -> do
              let priv_chunks = slugPrivChunks slug
              sComment "write map result(s) to private chunk(s)" $
                forM2_ priv_chunks slug_res $ \priv_chunk (res, res_is) ->
                  copyDWIMFix priv_chunk [k] res res_is
        )
        -- if out of bounds, fill chunk(s) with neutral element(s)
        ( forM_ slugs $ \slug ->
            forM2_ (slugPrivChunks slug) (slugNeutral slug) $ \priv_chunk ne ->
              copyDWIMFix priv_chunk [k] ne []
        )
    sOp $ Imp.ErrorSync Imp.FenceLocal

    sComment "effectualize collective copies in local memory" $ do
      forM_ slugs $ \slug -> do
        let coll_copy_arrs = slugCollCopyArrs slug
        let priv_chunks = slugPrivChunks slug

        forM2_ coll_copy_arrs priv_chunks $ \lmem_arr priv_chunk -> do
          chunkLoop $ \k -> do
            lmem_idx <- dPrimVE "lmem_idx" $ ltid + k * tblock_size
            copyDWIMFix lmem_arr [lmem_idx] (Var priv_chunk) [k]
          sOp $ Imp.Barrier Imp.FenceLocal

          chunkLoop $ \k -> do
            lmem_idx <- dPrimVE "lmem_idx" $ ltid * chunk + k
            copyDWIMFix priv_chunk [k] (Var lmem_arr) [lmem_idx]
          sOp $ Imp.Barrier Imp.FenceLocal

    sComment "per-thread sequential reduction of private chunk(s)" $ do
      chunkLoop $ \k ->
        forM_ slugs $ \slug -> do
          let accs = map fst $ slugAccs slug
          let (acc_ps, next_ps) = slugSplitParams slug
          let ps_accs_chunks = zip4 acc_ps next_ps accs (slugPrivChunks slug)

          sComment "load params for all reductions" $ do
            forM_ ps_accs_chunks $ \(acc_p, next_p, acc, priv_chunk) -> do
              copyDWIM (paramName acc_p) [] (Var acc) []
              copyDWIMFix (paramName next_p) [] (Var priv_chunk) [k]

          sComment "apply reduction operator(s)" $ do
            let binop_ress = map resSubExp $ bodyResult $ slugBody slug
            compileStms mempty (bodyStms $ slugBody slug) $
              forM2_ accs binop_ress $ \acc binop_res ->
                copyDWIM acc [] binop_res []
    doLMemBlockReduce
  sOp $ Imp.ErrorSync Imp.FenceLocal
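
-- The two chunk loops in the collective copies above effectively
-- transpose each chunk through local memory: values are written
-- coalesced at @ltid + k * tblock_size@ and read back thread-privately
-- at @ltid * chunk + k@.  As a small illustration (made-up sizes, not
-- generated code), with @tblock_size@ = 4 and @chunk@ = 2, thread 1
-- writes its private elements to local indices 1 and 5 but reads back
-- local indices 2 and 3, so each thread ends up with a contiguous run
-- of the block's window while the global loads stay coalesced.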

reductionStageTwo ::
  [PatElem LetDecMem] ->
  Imp.TExp Int32 ->
  [Imp.TExp Int64] ->
  Imp.TExp Int64 ->
  Imp.TExp Int64 ->
  SegBinOpSlug ->
  Lambda GPUMem ->
  VName ->
  VName ->
  Imp.TExp Int64 ->
  InKernelGen ()
reductionStageTwo segred_pes tblock_id segment_gtids first_block_for_segment blocks_per_segment slug new_lambda counters sync_arr counter_idx = do
  constants <- kernelConstants <$> askEnv
  let ltid32 = kernelLocalThreadId constants
      ltid = sExt64 ltid32
      tblock_size = kernelBlockSize constants
  let (acc_params, next_params) = slugSplitParams slug
      nes = slugNeutral slug
      red_arrs = slugBlockRedArrs slug
      block_res_arrs = blockResArrs slug

  old_counter <- dPrim "old_counter" int32
  (counter_mem, _, counter_offset) <- fullyIndexArray counters [counter_idx]
  sComment "first thread in block saves block result to global memory" $
    sWhen (ltid32 .==. 0) $ do
      forM_ (take (length nes) $ zip block_res_arrs (slugAccs slug)) $ \(v, (acc, acc_is)) ->
        copyDWIMFix v [0, sExt64 tblock_id] (Var acc) acc_is
      sOp $ Imp.MemFence Imp.FenceGlobal
      -- Increment the counter, thus stating that our result is
      -- available.
      sOp $
        Imp.Atomic DefaultSpace $
          Imp.AtomicAdd Int32 (tvVar old_counter) counter_mem counter_offset $
            untyped (1 :: Imp.TExp Int32)
      -- Now check if we were the last block to write our result.  If
      -- so, it is our responsibility to produce the final result.
      sWrite sync_arr [0] $ untyped $ tvExp old_counter .==. blocks_per_segment - 1

  sOp $ Imp.Barrier Imp.FenceGlobal

  is_last_block <- dPrim "is_last_block" Bool
  copyDWIMFix (tvVar is_last_block) [] (Var sync_arr) [0]

  sWhen (tvExp is_last_block) $ do
    -- The final block has written its result (and it was
    -- us!), so read in all the block results and perform the
    -- final stage of the reduction.  But first, we reset the
    -- counter so it is ready for next time.  This is done
    -- with an atomic to avoid warnings about write/write
    -- races in oclgrind.
    sWhen (ltid32 .==. 0) $
      sOp $
        Imp.Atomic DefaultSpace $
          Imp.AtomicAdd Int32 (tvVar old_counter) counter_mem counter_offset $
            untyped $
              negate blocks_per_segment

    sLoopNest (slugShape slug) $ \vec_is -> do
      unless (null $ slugShape slug) $
        sOp (Imp.Barrier Imp.FenceLocal)

      -- There is no guarantee that the number of threadblocks for the
      -- segment is less than the threadblock size, so each thread may
      -- have to read multiple elements.  We do this in a sequential
      -- way that may induce non-coalesced accesses, but the total
      -- number of accesses should be tiny here.
      --
      -- TODO: here we *could* insert a collective copy of the num_tblocks
      -- per-block results.  However, it may not be beneficial, since num_tblocks
      -- is not necessarily larger than tblock_size, meaning the number of
      -- uncoalesced reads here may be insignificant.  In fact, if we happen to
      -- have a num_tblocks < tblock_size, then the collective copy would add
      -- unnecessary overhead.  Also, this code is only executed by a single
      -- block.
      sComment "read in the per-block-results" $ do
        read_per_thread <-
          dPrimVE "read_per_thread" $
            blocks_per_segment `divUp` sExt64 tblock_size

        forM2_ acc_params nes $ \p ne ->
          copyDWIM (paramName p) [] ne []

        sFor "i" read_per_thread $ \i -> do
          block_res_id <-
            dPrimVE "block_res_id" $
              ltid * read_per_thread + i
          index_of_block_res <-
            dPrimVE "index_of_block_res" $
              first_block_for_segment + block_res_id

          sWhen (block_res_id .<. blocks_per_segment) $ do
            forM2_ next_params block_res_arrs $ \p block_res_arr ->
              copyDWIMFix
                (paramName p)
                []
                (Var block_res_arr)
                ([0, index_of_block_res] ++ vec_is)

            compileStms mempty (bodyStms $ slugBody slug) $
              forM2_ acc_params (map resSubExp $ bodyResult $ slugBody slug) $ \p se ->
                copyDWIMFix (paramName p) [] se []

      forM2_ acc_params red_arrs $ \p arr ->
        when (isPrimParam p) $
          copyDWIMFix arr [ltid] (Var $ paramName p) []

      sOp $ Imp.ErrorSync Imp.FenceLocal

      sComment "reduce the per-block results" $ do
        blockReduce (sExt32 tblock_size) new_lambda red_arrs

        sComment "and back to memory with the final result" $
          sWhen (ltid32 .==. 0) $
            forM2_ segred_pes (lambdaParams new_lambda) $ \pe p ->
              copyDWIMFix
                (patElemName pe)
                (segment_gtids ++ vec_is)
                (Var $ paramName p)
                []
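
-- The single-kernel two-phase scheme implemented by 'reductionStageTwo'
-- above can be summarised as follows (a rough sketch of the generated
-- control flow per threadblock, not the emitted code itself):
--
-- > write my block's partial result to global memory
-- > memory fence; old = atomicAdd(counter, 1)
-- > if old == blocks_per_segment - 1:   -- this block was the last one
-- >   reset the counter for the next segment/run
-- >   read all per-block partials and reduce them within this block
-- >   thread 0 writes the final result
--
-- so no separate second kernel launch is needed.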

-- Note [IntermArrays]
--
-- Intermediate memory for the nonsegmented and large segments non-commutative
-- reductions with all primitive parameters:
--
-- These kernels need local memory for 1) the initial collective copy, 2) the
-- (virtualized) block reductions, and (TODO: this one not implemented yet!)
-- 3) the final single-block collective copy.  There are no dependencies
-- between these three stages, so we can reuse the same pool of local mem for
-- all three.  These intermediates all go into local mem because of the
-- assumption of primitive parameter types.
--
-- Let `elem_sizes` be a list of element type sizes for the reduction
-- operators in a given redomap fusion.  Then the amount of local mem needed
-- across the three steps is:
--
-- 1) The initial collective copy from global to thread-private memory
-- requires `tblock_size * CHUNK * max elem_sizes`, since the collective copies
-- are performed in sequence (i.e. inputs to different reduction operators need
-- not be held in local mem simultaneously).
-- 2) The intra-block reductions of the per-thread results held in local memory
-- require `tblock_size * sum elem_sizes` bytes, since per-thread results for
-- all fused reductions are block-reduced simultaneously.
-- 3) If tblock_size < num_tblocks, then after the final single-block collective
-- copy, a thread-sequential reduction reduces the number of per-block partial
-- results from num_tblocks down to tblock_size for each reduction array, such
-- that they will each fit in the final intra-block reduction.  This requires
-- `num_tblocks * max elem_sizes`.
--
-- In summary, the total amount of local mem needed is the maximum of:
-- 1) initial collective copy: tblock_size * CHUNK * max elem_sizes
-- 2) intra-block reductions:  tblock_size * sum elem_sizes
-- 3) final collective copy:   num_tblocks * max elem_sizes
--
-- The amount of local mem will most likely be decided by 1) in most cases,
-- unless the number of fused operators is very high *or* we have
-- `num_tblocks > tblock_size * CHUNK` (which is unlikely), in which case 2)
-- or 3), respectively, will dominate.
--
-- Aside from local memory, these kernels also require a CHUNK-sized array of
-- thread-private register memory per reduction operator.
--
-- For all other reductions, i.e. commutative reductions, reductions with at
-- least one non-primitive operator, and small segments reductions:
--
-- These kernels use local memory only for the intra-block reductions, and
-- since they do not use chunking or CHUNK, they all require only
-- `tblock_size * max elem_sizes` bytes of local memory and no thread-private
-- register mem.
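
-- As a worked example of the bounds above (illustrative numbers only):
-- with tblock_size = 256, CHUNK = 4, num_tblocks = 128 and two fused
-- operators of element sizes 4 and 8 bytes,
--   1) initial collective copy: 256 * 4 * 8   = 8192 bytes
--   2) intra-block reductions:  256 * (4 + 8) = 3072 bytes
--   3) final collective copy:   128 * 8       = 1024 bytes
-- so the shared pool would be sized by 1) at 8192 bytes, plus a
-- 4-element private chunk per operator kept in registers.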