GENIE
scan.cu
/*
 * This module contains source code provided by NVIDIA Corporation.
 */

#include <assert.h>

#include "scan.h"

const uint THREADBLOCK_SIZE = GPUGENIE_SCAN_THREADBLOCK_SIZE;

const unsigned int SCAN_THREADBLOCK_SIZE = GPUGENIE_SCAN_THREADBLOCK_SIZE;
const unsigned int SCAN_MIN_SHORT_ARRAY_SIZE = GPUGENIE_SCAN_MIN_SHORT_ARRAY_SIZE;
const unsigned int SCAN_MAX_SHORT_ARRAY_SIZE = GPUGENIE_SCAN_MAX_SHORT_ARRAY_SIZE;
const unsigned int SCAN_MIN_LARGE_ARRAY_SIZE = GPUGENIE_SCAN_MIN_LARGE_ARRAY_SIZE;
const unsigned int SCAN_MAX_LARGE_ARRAY_SIZE = GPUGENIE_SCAN_MAX_LARGE_ARRAY_SIZE;

// Naive inclusive scan: O(N * log2(N)) operations
// Allocate 2 * 'size' shared memory and initialize the first half with zeros; this avoids
// evaluating an if (pos >= offset) condition inside the loop and saves instructions
inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
{
    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for (uint offset = 1; offset < size; offset <<= 1)
    {
        __syncthreads();
        uint t = s_Data[pos] + s_Data[pos - offset];
        __syncthreads();
        s_Data[pos] = t;
    }

    return s_Data[pos];
}

inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
{
    return scan1Inclusive(idata, s_Data, size) - idata;
}
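
// Worked example (illustrative, not part of the original code), for a 4-element segment {3, 1, 4, 1}:
//   scan1Inclusive -> {3, 4, 8, 9}   (each output includes the thread's own input)
//   scan1Exclusive -> {0, 3, 4, 8}   (the inclusive result minus the thread's own input)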

inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size)
{
    //Level-0 inclusive scan
    idata4.y += idata4.x;
    idata4.z += idata4.y;
    idata4.w += idata4.z;

    //Level-1 exclusive scan
    uint oval = scan1Exclusive(idata4.w, s_Data, size / 4);

    idata4.x += oval;
    idata4.y += oval;
    idata4.z += oval;
    idata4.w += oval;

    return idata4;
}

// Exclusive vector scan: the array to be scanned is stored in thread-local memory as uint4
inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size)
{
    uint4 odata4 = scan4Inclusive(idata4, s_Data, size);
    odata4.x -= idata4.x;
    odata4.y -= idata4.y;
    odata4.z -= idata4.z;
    odata4.w -= idata4.w;
    return odata4;
}
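
// Illustrative sketch of how the two levels compose (not part of the original code).
// Assume two threads with size == 8, thread0 holding {1,2,3,4} and thread1 holding {5,6,7,8}:
//   Level-0 inclusive scan inside each uint4:        thread0 -> {1,3,6,10},  thread1 -> {5,11,18,26}
//   Level-1 exclusive scan of the per-thread totals
//   (the .w components, {10,26}):                    thread0 offset = 0,     thread1 offset = 10
//   Adding the offsets gives the inclusive scan:     thread0 -> {1,3,6,10},  thread1 -> {15,21,28,36}
//   scan4Exclusive subtracts the original inputs to recover the exclusive scan:
//                                                    thread0 -> {0,1,3,6},   thread1 -> {10,15,21,28}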

__global__ void g_scanExclusiveShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate exclusive scan
    uint4 odata4 = scan4Exclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}

__device__ void d_scanExclusivePerBlockShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate exclusive scan
    uint4 odata4 = scan4Exclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}
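
// Illustrative sketch (hypothetical, not part of the original file): because the d_* variants are
// __device__ functions, another kernel can embed a per-block exclusive scan as one step of a larger
// pipeline. This assumes blockDim.x == THREADBLOCK_SIZE, that each block owns a contiguous chunk of
// blockDim.x uint4 elements, and that pow2size is the power-of-two padded element count of a chunk
// (at most 4 * THREADBLOCK_SIZE).
__global__ void example_perBlockScanUser(uint4 *d_Dst, uint4 *d_Src, uint activeThreadsPerBlock, uint pow2size)
{
    // Offset to this block's chunk; every thread of the block participates in the call below.
    uint4 *blockSrc = d_Src + blockIdx.x * blockDim.x;
    uint4 *blockDst = d_Dst + blockIdx.x * blockDim.x;

    // Each block independently computes an exclusive scan over its own chunk.
    d_scanExclusivePerBlockShared(blockDst, blockSrc, activeThreadsPerBlock, pow2size);

    // ... further per-block work could consume the scanned values here ...
}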

__device__ void d_scanExclusiveShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate exclusive scan
    uint4 odata4 = scan4Exclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}

__global__ void g_scanInclusiveShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate inclusive scan
    uint4 odata4 = scan4Inclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}

__device__ void d_scanInclusivePerBlockShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate inclusive scan
    uint4 odata4 = scan4Inclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}

__device__ void d_scanInclusiveShared(
    uint4 *d_Dst,
    uint4 *d_Src,
    uint activeThreads,
    uint pow2size)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    //Load data
    uint4 idata4 = (pos < activeThreads) ? d_Src[pos] : uint4{0,0,0,0};

    //Calculate inclusive scan
    uint4 odata4 = scan4Inclusive(idata4, s_Data, pow2size);

    //Write back
    if (pos < activeThreads)
        d_Dst[pos] = odata4;
}

//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
__global__ void scanExclusiveShared2(
    uint *d_Buf,
    uint *d_Dst,
    uint *d_Src,
    uint arrayLength,
    uint blocks)
{
    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];

    //Skip stores for inactive threads of the last threadblock (pos >= blocks)
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    //Load top elements
    //Convert results of bottom-level scan back to inclusive
    uint idata = 0;

    uint sumLocation;
    if (pos < blocks - 1)
        sumLocation = (4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos;
    else
        sumLocation = arrayLength;

    idata = d_Dst[sumLocation] + d_Src[sumLocation];

    //Compute
    uint odata = scan1Exclusive(idata, s_Data, blocks);

    //Avoid out-of-bound access
    if (pos < blocks)
    {
        d_Buf[pos] = odata;
    }
}

//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
__global__ void uniformUpdate(
    uint4 *d_Data,
    uint *d_Buffer,
    uint arrayLength)
{
    __shared__ uint buf;
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIdx.x == 0)
    {
        buf = d_Buffer[blockIdx.x];
    }

    __syncthreads();

    if (pos < arrayLength)
    {
        uint4 data4 = d_Data[pos];
        data4.x += buf;
        data4.y += buf;
        data4.z += buf;
        data4.w += buf;
        d_Data[pos] = data4;
    }
}
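
// Illustrative end-to-end trace of the three phases (hypothetical numbers, using a chunk size of
// 4 * THREADBLOCK_SIZE == 8 only for brevity; the real chunk size comes from scan.h):
//   src                 = {1,1,1,1,1,1,1,1,  2,2,2,2,2,2,2,2}
//   g_scanExclusiveShared writes per-chunk exclusive scans into d_Dst:
//                         {0,1,2,3,4,5,6,7,  0,2,4,6,8,10,12,14}
//   scanExclusiveShared2 recovers each chunk's total as d_Dst + d_Src at the chunk's top element
//   (converting the exclusive result back to an inclusive total), i.e. {8, 16}, and scans the
//   totals exclusively into d_Buf = {0, 8}
//   uniformUpdate adds d_Buf[blockIdx.x] to every element of its chunk, giving the full scan:
//                         {0,1,2,3,4,5,6,7,  8,10,12,14,16,18,20,22}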

//Internal exclusive scan buffer
static uint *d_Buf;

void initScan(void)
{
    cudaCheckErrors(cudaMalloc((void **)&d_Buf, THREADBLOCK_SIZE * sizeof(uint)));
}

void closeScan(void)
{
    cudaCheckErrors(cudaFree(d_Buf));
}

// Returns the first power of two greater than or equal to x
__device__ uint genie::utility::d_pow2ceil_32 (uint x)
{
    if (x == 0)
        return 0;
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return x + 1;
}

// Returns the first power of two greater than or equal to x
uint h_pow2ceil_32(uint x)
{
    if (x == 0)
        return 0;
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return x + 1;
}
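
// Worked example of the bit-smearing trick above (illustrative, not part of the original code):
// for x = 5 (0b0101), --x gives 0b0100; the shift/OR cascade then sets every bit below the highest
// set bit, yielding 0b0111; adding 1 produces 0b1000 = 8, the smallest power of two >= 5.
// Sample values: pow2ceil(1) = 1, pow2ceil(5) = 8, pow2ceil(1024) = 1024, pow2ceil(1025) = 2048.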

static uint iDivUp(uint dividend, uint divisor)
{
    return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1);
}

size_t scanExclusiveShort(
    uint *d_Dst,
    uint *d_Src,
    uint arrayLength)
{
    // Check the array length is a multiple of 4. This is because we use uint4 processed by a single thread.
    assert(arrayLength % 4 == 0);

    //Check that the power-of-two padded size covers the array
    uint pow2arrayLength = h_pow2ceil_32(arrayLength);
    // printf("power of two size: %u\n", pow2arrayLength);
    assert(pow2arrayLength >= arrayLength);

    // Check supported size range
    assert((pow2arrayLength >= SCAN_MIN_SHORT_ARRAY_SIZE) && (pow2arrayLength <= SCAN_MAX_SHORT_ARRAY_SIZE));

    // printf("running scanExclusiveShort on %d blocks each of %d threads, total active threads: %d\n",
    //     (pow2arrayLength+(4*THREADBLOCK_SIZE)-1)/(4*THREADBLOCK_SIZE),THREADBLOCK_SIZE, arrayLength/4);

    g_scanExclusiveShared<<<(pow2arrayLength + (4 * THREADBLOCK_SIZE) - 1) / (4 * THREADBLOCK_SIZE),
            THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst,
        (uint4 *)d_Src,
        arrayLength / 4,
        pow2arrayLength
    );
    CUDA_LAST_ERROR();

    return THREADBLOCK_SIZE;
}
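
// Usage sketch (illustrative assumptions, not from the original file): scanExclusiveShort takes
// device pointers and an arrayLength that is a multiple of 4 whose power-of-two ceiling lies in
// [SCAN_MIN_SHORT_ARRAY_SIZE, SCAN_MAX_SHORT_ARRAY_SIZE]. A hypothetical call looks like:
//
//     uint *d_in, *d_out;                         // device buffers holding n uints, n % 4 == 0
//     cudaCheckErrors(cudaMalloc((void **)&d_in,  n * sizeof(uint)));
//     cudaCheckErrors(cudaMalloc((void **)&d_out, n * sizeof(uint)));
//     /* ... copy input into d_in ... */
//     scanExclusiveShort(d_out, d_in, n);         // unlike scanExclusiveLarge, no initScan() is needed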

size_t scanExclusiveLarge(
    uint *d_Dst,
    uint *d_Src,
    uint arrayLength)
{
    // Check the array length is a multiple of 4. This is because we use uint4 processed by a single thread.
    assert(arrayLength % 4 == 0);

    //Check that the power-of-two padded size covers the array
    uint pow2arrayLength = h_pow2ceil_32(arrayLength);
    // printf("power of two size: %u\n", pow2arrayLength);
    assert(pow2arrayLength >= arrayLength);

    //Check supported size range
    assert((pow2arrayLength >= SCAN_MIN_LARGE_ARRAY_SIZE) && (pow2arrayLength <= SCAN_MAX_LARGE_ARRAY_SIZE));

    // printf("running scanExclusiveLong on %d blocks each of %d threads\n",
    //     (pow2arrayLength + (4 * THREADBLOCK_SIZE) - 1) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE);

    g_scanExclusiveShared<<<(pow2arrayLength + (4 * THREADBLOCK_SIZE) - 1) / (4 * THREADBLOCK_SIZE),
            THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst,
        (uint4 *)d_Src,
        arrayLength / 4,
        4 * THREADBLOCK_SIZE
    );
    CUDA_LAST_ERROR();

    //Not all threadblocks need to be packed with input data:
    //inactive threads of highest threadblock just don't do global reads and writes
    const uint blockCount2 = iDivUp(pow2arrayLength / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE);
    scanExclusiveShared2<<<blockCount2, THREADBLOCK_SIZE>>>(
        (uint *)d_Buf,
        (uint *)d_Dst,
        (uint *)d_Src,
        arrayLength, // uses the original arrayLength for the uint array, unlike g_scanExclusiveShared and uniformUpdate
        pow2arrayLength / (4 * THREADBLOCK_SIZE)
    );
    CUDA_LAST_ERROR();

    uniformUpdate<<<pow2arrayLength / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
        (uint4 *)d_Dst,
        (uint *)d_Buf,
        (arrayLength + 3) / 4
    );
    CUDA_LAST_ERROR();

    return THREADBLOCK_SIZE;
}

void scanExclusiveHost(
    uint *dst,
    uint *src,
    uint arrayLength)
{
    dst[0] = 0;

    for (uint j = 1; j < arrayLength; j++)
        dst[j] = src[j - 1] + dst[j - 1];
}
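
// Usage sketch (illustrative assumptions, not part of the original file): running the large-array
// scan end to end and checking it against the host reference above. The array size, data, and
// host-side names here are made up for the example; it only assumes N is a multiple of 4 whose
// power-of-two ceiling lies within the LARGE size range.
#include <cstdio>
#include <cstdlib>

static bool exampleScanLargeRoundTrip()
{
    const uint N = 1 << 20;
    uint *h_Src = (uint *)malloc(N * sizeof(uint));
    uint *h_Ref = (uint *)malloc(N * sizeof(uint));
    uint *h_Out = (uint *)malloc(N * sizeof(uint));
    for (uint i = 0; i < N; i++)
        h_Src[i] = rand() & 15;

    uint *d_Src, *d_Dst;
    cudaCheckErrors(cudaMalloc((void **)&d_Src, N * sizeof(uint)));
    cudaCheckErrors(cudaMalloc((void **)&d_Dst, N * sizeof(uint)));
    cudaCheckErrors(cudaMemcpy(d_Src, h_Src, N * sizeof(uint), cudaMemcpyHostToDevice));

    initScan();                                    // allocate the internal d_Buf used by scanExclusiveLarge
    scanExclusiveLarge(d_Dst, d_Src, N);
    cudaCheckErrors(cudaMemcpy(h_Out, d_Dst, N * sizeof(uint), cudaMemcpyDeviceToHost));
    closeScan();

    scanExclusiveHost(h_Ref, h_Src, N);            // CPU reference for comparison
    bool ok = true;
    for (uint i = 0; i < N; i++)
        ok = ok && (h_Out[i] == h_Ref[i]);
    printf("scanExclusiveLarge %s the host reference\n", ok ? "matches" : "does NOT match");

    cudaCheckErrors(cudaFree(d_Dst));
    cudaCheckErrors(cudaFree(d_Src));
    free(h_Out);
    free(h_Ref);
    free(h_Src);
    return ok;
}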