inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
{
    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    ...
    for (uint offset = 1; offset < size; offset <<= 1)
    {
        ...
        uint t = s_Data[pos] + s_Data[pos - offset];
        ...
    }
    ...
}
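The elided lines are the zero-padded Hillis-Steele scan of the CUDA SDK "scan" sample that this code appears to follow; a hedged reconstruction (the padding stores, barrier placement, and return value are assumptions based on that sample):

inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
{
    // Each thread owns a slot in the upper half of a 2*size shared array;
    // the lower half is zero-filled so s_Data[pos - offset] never needs a
    // bounds check inside the loop.
    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for (uint offset = 1; offset < size; offset <<= 1)
    {
        __syncthreads();
        uint t = s_Data[pos] + s_Data[pos - offset];
        __syncthreads();
        s_Data[pos] = t;
    }

    return s_Data[pos];
}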
inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
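Only the signature survives in this excerpt. In the SDK sample this code mirrors, the exclusive variant is a one-liner, so a minimal sketch:

inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
{
    // An exclusive scan is the inclusive scan shifted by one position,
    // i.e. the inclusive prefix sum minus the element itself.
    return scan1Inclusive(idata, s_Data, size) - idata;
}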
inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size)
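The body is elided; a hedged sketch of the two-level structure from the SDK sample (a serial scan inside each uint4, then a block-level exclusive scan of the per-thread sums):

inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size)
{
    // Level 0: inclusive scan of the four elements held by this thread.
    idata4.y += idata4.x;
    idata4.z += idata4.y;
    idata4.w += idata4.z;

    // Level 1: exclusive scan of the per-thread totals (idata4.w).
    // size counts elements, so size / 4 threads participate.
    uint oval = scan1Exclusive(idata4.w, s_Data, size / 4);

    idata4.x += oval;
    idata4.y += oval;
    idata4.z += oval;
    idata4.w += oval;

    return idata4;
}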
inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size)
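Again only the signature remains; per the SDK sample, the exclusive uint4 scan subtracts the input from the inclusive result component-wise, a minimal sketch:

inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size)
{
    uint4 odata4 = scan4Inclusive(idata4, s_Data, size);
    odata4.x -= idata4.x;
    odata4.y -= idata4.y;
    odata4.z -= idata4.z;
    odata4.w -= idata4.w;
    return odata4;
}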
The same guarded load/store prologue recurs in all six shared-memory scan entry points (the g_*Shared kernels, the d_*Shared helpers, and the d_*PerBlockShared variants). Four of them use the grid-wide index

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

while the two *PerBlockShared variants, which treat each thread block's segment independently, index with the thread index alone:

    uint pos = threadIdx.x;

In every case, threads past the active range load zeros instead of reading out of bounds, and the final store is guarded by the same test:

    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0, 0, 0, 0};
    ...
    if (pos < activeThreads)
        ...

A worked example of how the prologue combines with the scan4* primitives follows below.
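A hedged sketch of g_scanExclusiveShared assembled from this prologue and scan4Exclusive; the shared-memory declaration and the exact body are assumptions patterned on the SDK sample (the real kernel may simply delegate to d_scanExclusiveShared):

__global__ void g_scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src,
                                      uint activeThreads, uint pow2size)
{
    __shared__ uint s_Data[2 * GPUGENIE_SCAN_THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Load: inactive tail threads contribute zeros.
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0, 0, 0, 0};

    // Scan four elements per thread over the padded power-of-two size.
    uint4 odata4 = scan4Exclusive(idata4, s_Data, pow2size);

    // Store: only active threads write back.
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}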
// From scanExclusiveShared2: each thread picks up the top element of one
// bottom-level segment.
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
...
if (pos < blocks - 1)
    sumLocation = ...;               // top element of this thread's segment
else
    sumLocation = arrayLength - 1;   // last segment may be partial; its top
                                     // element is the final array element
// d_Dst holds each segment's exclusive scan, so the exclusive prefix plus
// the source element at the segment's last slot is its inclusive total:
uint idata = d_Dst[sumLocation] + d_Src[sumLocation];
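A hedged reconstruction of the whole second-level kernel, following the SDK sample; the segment-top index formula, the shared-memory size, and the d_pow2ceil_32 padding of the scan width (scan1Inclusive requires a power-of-two size) are assumptions:

__global__ void scanExclusiveShared2(uint *d_Buf, uint *d_Dst, uint *d_Src,
                                     uint arrayLength, uint blocks)
{
    __shared__ uint s_Data[2 * GPUGENIE_SCAN_THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    uint idata = 0;
    if (pos < blocks)
    {
        uint sumLocation;
        if (pos < blocks - 1)
            sumLocation = (pos + 1) * (4 * GPUGENIE_SCAN_THREADBLOCK_SIZE) - 1;
        else
            sumLocation = arrayLength - 1;  // last segment may be partial
        // Exclusive prefix + source element = the segment's inclusive total.
        idata = d_Dst[sumLocation] + d_Src[sumLocation];
    }

    // Exclusive scan of the per-segment totals, padded to a power of two.
    uint odata = scan1Exclusive(idata, s_Data, d_pow2ceil_32(blocks));

    if (pos < blocks)
        d_Buf[pos] = odata;
}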
// From uniformUpdate: thread 0 fetches this block's scanned segment offset
// (into the shared variable buf), then every in-range thread adds it to its
// four elements.
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
...
if (threadIdx.x == 0)
    buf = d_Buffer[blockIdx.x];
...
if (pos < arrayLength)
{
    uint4 data4 = d_Data[pos];
    ...
}
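A hedged sketch of the full kernel; the shared-memory broadcast variable and the barrier are assumptions carried over from the SDK sample:

__global__ void uniformUpdate(uint4 *d_Data, uint *d_Buffer, uint arrayLength)
{
    __shared__ uint buf;
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    // One thread per block fetches the block's scanned segment offset ...
    if (threadIdx.x == 0)
        buf = d_Buffer[blockIdx.x];
    // ... and the barrier makes it visible to the whole block.
    __syncthreads();

    if (pos < arrayLength)  // arrayLength counts uint4 elements here
    {
        uint4 data4 = d_Data[pos];
        data4.x += buf;
        data4.y += buf;
        data4.z += buf;
        data4.w += buf;
        d_Data[pos] = data4;
    }
}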
static uint iDivUp(uint dividend, uint divisor)
{
    return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1);
}
// From scanExclusiveShort: the input must be uint4-aligned, and the
// power-of-two padded length must cover it.
assert(arrayLength % 4 == 0);
...
assert(pow2arrayLength >= arrayLength);
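A hedged sketch of the short-array wrapper; the single-block launch configuration and the upper-bound assert are assumptions consistent with the member list below:

size_t scanExclusiveShort(unsigned int *d_Dst, unsigned int *d_Src, unsigned int arrayLength)
{
    assert(arrayLength % 4 == 0);  // processed as uint4
    uint pow2arrayLength = h_pow2ceil_32(arrayLength);
    assert(pow2arrayLength >= arrayLength);
    assert(pow2arrayLength <= GPUGENIE_SCAN_MAX_SHORT_ARRAY_SIZE);

    // A single block scans the whole power-of-two padded array.
    g_scanExclusiveShared<<<1, THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst, (uint4 *)d_Src,
        (arrayLength + 3) / 4,   // active uint4-sized threads
        pow2arrayLength);
    CUDA_LAST_ERROR();

    return pow2arrayLength;
}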
// From scanExclusiveLarge: the same preconditions, then the second-level
// scan of per-segment sums and the final uniform update.
assert(arrayLength % 4 == 0);
...
assert(pow2arrayLength >= arrayLength);
...
scanExclusiveShared2<<<blockCount2, THREADBLOCK_SIZE>>>(...);
...
(arrayLength + 3) / 4   // active uint4 count, passed to uniformUpdate
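A hedged sketch of the full three-phase pipeline. The launch parameters follow the SDK sample; d_Buf, the scratch buffer for per-segment totals, is not shown in this listing, so its allocation here is purely illustrative (the real code may preallocate it):

size_t scanExclusiveLarge(unsigned int *d_Dst, unsigned int *d_Src, unsigned int arrayLength)
{
    assert(arrayLength % 4 == 0);
    uint pow2arrayLength = h_pow2ceil_32(arrayLength);
    assert(pow2arrayLength >= arrayLength);

    // Phase 1: independent exclusive scan of each 4*THREADBLOCK_SIZE segment.
    uint blockCount = iDivUp(arrayLength, 4 * THREADBLOCK_SIZE);
    g_scanExclusiveShared<<<blockCount, THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst, (uint4 *)d_Src,
        (arrayLength + 3) / 4, 4 * THREADBLOCK_SIZE);
    CUDA_LAST_ERROR();

    // Scratch buffer for per-segment totals (illustrative allocation).
    uint *d_Buf;
    cudaCheckErrors(cudaMalloc(&d_Buf, blockCount * sizeof(uint)));

    // Phase 2: exclusive scan of the segment totals.
    uint blockCount2 = iDivUp(blockCount, THREADBLOCK_SIZE);
    scanExclusiveShared2<<<blockCount2, THREADBLOCK_SIZE>>>(
        d_Buf, d_Dst, d_Src, arrayLength, blockCount);
    CUDA_LAST_ERROR();

    // Phase 3: add each segment's base offset to all of its elements.
    uniformUpdate<<<blockCount, THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst, d_Buf, (arrayLength + 3) / 4);
    CUDA_LAST_ERROR();

    cudaCheckErrors(cudaFree(d_Buf));
    return pow2arrayLength;
}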
// From scanExclusiveHost, the sequential CPU reference:
for (uint j = 1; j < arrayLength; j++)
    dst[j] = src[j - 1] + dst[j - 1];
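Only the initialization of dst[0] is missing from the excerpt; it is forced to zero by the exclusive-scan definition (the first prefix is empty), so a minimal completion:

void scanExclusiveHost(unsigned int *dst, unsigned int *src, unsigned int arrayLength)
{
    dst[0] = 0;  // exclusive scan: the first prefix sum is empty
    for (uint j = 1; j < arrayLength; j++)
        dst[j] = src[j - 1] + dst[j - 1];
}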
Members of scan.cu, grouped by kind:

Macros:
    #define GPUGENIE_SCAN_THREADBLOCK_SIZE
    #define GPUGENIE_SCAN_MIN_SHORT_ARRAY_SIZE
    #define GPUGENIE_SCAN_MAX_SHORT_ARRAY_SIZE
    #define GPUGENIE_SCAN_MIN_LARGE_ARRAY_SIZE
    #define GPUGENIE_SCAN_MAX_LARGE_ARRAY_SIZE
    #define CUDA_LAST_ERROR()
    #define cudaCheckErrors(err)
        The wrapper function to validate CUDA calls.

Constants:
    const uint THREADBLOCK_SIZE
    const unsigned int SCAN_THREADBLOCK_SIZE
    const unsigned int SCAN_MIN_SHORT_ARRAY_SIZE
    const unsigned int SCAN_MAX_SHORT_ARRAY_SIZE
    const unsigned int SCAN_MIN_LARGE_ARRAY_SIZE
    const unsigned int SCAN_MAX_LARGE_ARRAY_SIZE

Device functions:
    __device__ uint d_pow2ceil_32(uint x)
    __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
    __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
    __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size)
    __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size)
    __device__ void d_scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
    __device__ void d_scanInclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
    __device__ void d_scanExclusivePerBlockShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
    __device__ void d_scanInclusivePerBlockShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)

Kernels:
    __global__ void g_scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
    __global__ void g_scanInclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
    __global__ void scanExclusiveShared2(uint *d_Buf, uint *d_Dst, uint *d_Src, uint arrayLength, uint blocks)
    __global__ void uniformUpdate(uint4 *d_Data, uint *d_Buffer, uint arrayLength)

Host functions:
    uint h_pow2ceil_32(uint x)
    size_t scanExclusiveShort(unsigned int *d_Dst, unsigned int *d_Src, unsigned int arrayLength)
    size_t scanExclusiveLarge(unsigned int *d_Dst, unsigned int *d_Src, unsigned int arrayLength)
    void scanExclusiveHost(unsigned int *dst, unsigned int *src, unsigned int arrayLength)
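A hypothetical caller, sketched purely from the names above; the values of the threshold constants and the exact dispatch policy are not shown in this listing:

// d_Src and d_Dst are device arrays of arrayLength unsigned ints,
// with arrayLength a multiple of 4.
size_t paddedLength;
if (arrayLength <= SCAN_MAX_SHORT_ARRAY_SIZE)
    paddedLength = scanExclusiveShort(d_Dst, d_Src, arrayLength);
else
    paddedLength = scanExclusiveLarge(d_Dst, d_Src, arrayLength);
CUDA_LAST_ERROR();

// scanExclusiveHost(h_Dst, h_Src, arrayLength) computes the CPU reference
// against which d_Dst can be validated.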