/* ----------------------------------------------------------------------------- * * Module : Fold * Copyright : (c) [2009..2010] Trevor L. McDonell * License : BSD * * Reduce an array to a single value with a binary associative function * * ---------------------------------------------------------------------------*/ #include #ifndef BLOCK_SIZE #define BLOCK_SIZE blockDim.x #endif #ifndef LENGTH_IS_POW_2 #define LENGTH_IS_POW_2 0 #endif /* * Compute multiple elements per thread sequentially. This reduces the overall * cost of the algorithm while keeping the work complexity O(n) and the step * complexity O(log n). c.f. Brent's Theorem optimisation. */ extern "C" __global__ void fold ( ArrOut d_out, const ArrIn0 d_in0, const Ix shape ) { extern __shared__ TyOut scratch[]; /* * Calculate first level of reduction reading into shared memory */ Ix i; const Ix tid = threadIdx.x; const Ix gridSize = BLOCK_SIZE * 2 * gridDim.x; scratch[tid] = identity(); /* * Reduce multiple elements per thread. The number is determined by the * number of active thread blocks (via gridDim). More blocks will result in * a larger `gridSize', and hence fewer elements per thread * * The loop stride of `gridSize' is used to maintain coalescing. */ for (i = blockIdx.x * BLOCK_SIZE * 2 + tid; i < shape; i += gridSize) { scratch[tid] = apply(scratch[tid], get0(d_in0, i)); /* * Ensure we don't read out of bounds. This is optimised away if the * input length is a power of two */ if (LENGTH_IS_POW_2 || i + BLOCK_SIZE < shape) scratch[tid] = apply(scratch[tid], get0(d_in0, i+BLOCK_SIZE)); } __syncthreads(); /* * Now, calculate the reduction in shared memory */ if (BLOCK_SIZE >= 512) { if (tid < 256) { scratch[tid] = apply(scratch[tid], scratch[tid+256]); } __syncthreads(); } if (BLOCK_SIZE >= 256) { if (tid < 128) { scratch[tid] = apply(scratch[tid], scratch[tid+128]); } __syncthreads(); } if (BLOCK_SIZE >= 128) { if (tid < 64) { scratch[tid] = apply(scratch[tid], scratch[tid+ 64]); } __syncthreads(); } #ifndef __DEVICE_EMULATION__ if (tid < 32) #endif { if (BLOCK_SIZE >= 64) { scratch[tid] = apply(scratch[tid], scratch[tid+32]); __EMUSYNC; } if (BLOCK_SIZE >= 32) { scratch[tid] = apply(scratch[tid], scratch[tid+16]); __EMUSYNC; } if (BLOCK_SIZE >= 16) { scratch[tid] = apply(scratch[tid], scratch[tid+ 8]); __EMUSYNC; } if (BLOCK_SIZE >= 8) { scratch[tid] = apply(scratch[tid], scratch[tid+ 4]); __EMUSYNC; } if (BLOCK_SIZE >= 4) { scratch[tid] = apply(scratch[tid], scratch[tid+ 2]); __EMUSYNC; } if (BLOCK_SIZE >= 2) { scratch[tid] = apply(scratch[tid], scratch[tid+ 1]); __EMUSYNC; } } /* * Write the results of this block back to global memory */ if (tid == 0) set(d_out, blockIdx.x, scratch[0]); }