match_integrated.cu

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <sstream>
#include <math.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <genie/utility/Logger.h>
#include <genie/utility/PerfLogger.h>
#include <genie/utility/Timing.h>
// The listing dropped several more linked includes here; judging from the identifiers
// used below they cover at least the CUDA error-check macros, the compressed table,
// the query type, and the device codecs (exact header paths assumed):
#include <genie/utility/cuda_macros.h>
#include <genie/table/inv_compr_table.h>
#include <genie/query/query.h>
#include <genie/compression/DeviceBitPackingCodec.h>
#include <genie/compression/DeviceVarintCodec.h>
#include <genie/compression/DeviceCompositeCodec.h>
#include <genie/compression/DeviceSerialCodec.h>

#include "match_common.h"
#include "match_integrated.h"
#include "match_device_utils.h"


#define GPUGENIE_INTEGRATED_KERNEL_SM_SIZE (1024)

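// Note: the matching kernel stages each compressed list and its decompressed form in
// two shared-memory arrays of GPUGENIE_INTEGRATED_KERNEL_SM_SIZE u32 words each,
// i.e. 2 * 1024 * 4 B = 8 KiB of shared memory per block. This also caps the
// decompressed length of a single posting list handled by one block.
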
using namespace genie::compression;
using namespace genie::matching;
using namespace genie::table;
using namespace genie::utility;
using namespace genie::query;
using namespace std;
using namespace thrust;

namespace genie
{
namespace matching
{

// Explicit instantiations of all supported matching functions
// TODO: Split this into multiple compilation units

// The four plain-codec instantiation heads below were dropped by the listing; the
// codec set (Copy, Delta, BitPacking, Varint) is recovered from the file's own
// cross-references, though their original order is assumed.
template void match_integrated<DeviceCopyCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceDeltaCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceBitPackingCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceVarintCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceCompositeCodec<DeviceBitPackingCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceCompositeCodec<DeviceBitPackingCodec,DeviceVarintCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceCopyCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceDeltaCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceVarintCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceBitPackingCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCompositeCodec<DeviceBitPackingCodec,DeviceCopyCodec>>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCompositeCodec<DeviceBitPackingCodec,DeviceVarintCodec>>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);


int getBitmapSize(int &in_out_bitmap_bits, u32 in_shift_bits_subsequence, int in_number_of_data_points, int in_queries_size)
{
    if (in_out_bitmap_bits < 0)
        in_out_bitmap_bits = -in_out_bitmap_bits; // for hash_table_size, still let the user determine it for now

    int threshold = in_out_bitmap_bits - 1, bitmap_size = 0;
    if (in_out_bitmap_bits > 1)
    {
        float logresult = std::log2((float) in_out_bitmap_bits);
        in_out_bitmap_bits = (int) logresult;
        if (logresult - in_out_bitmap_bits > 0)
        {
            in_out_bitmap_bits += 1;
        }
        logresult = std::log2((float) in_out_bitmap_bits);
        in_out_bitmap_bits = (int) logresult;
        if (logresult - in_out_bitmap_bits > 0)
        {
            in_out_bitmap_bits += 1;
        }
        in_out_bitmap_bits = pow(2, in_out_bitmap_bits);
        bitmap_size = ((((unsigned int)1<<in_shift_bits_subsequence) * in_number_of_data_points) / (32 / in_out_bitmap_bits) + 1)
                * in_queries_size;
    }
    else
    {
        in_out_bitmap_bits = threshold = 0;
    }

    Logger::log(Logger::DEBUG, "Bitmap bits: %d, threshold: %d, shift_bits_subsequence: %d",
            in_out_bitmap_bits, threshold, in_shift_bits_subsequence);

    return bitmap_size;
}
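// Worked example: for a requested bitmap_bits of 5, the two rounding passes give
// ceil(log2(5)) = 3, then ceil(log2(3)) = 2, so the counter width becomes 2^2 = 4 bits
// and each u32 word of the bitmap packs 32 / 4 = 8 per-document counters.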


int build_compressed_queries(vector<Query>& queries, inv_compr_table *ctable, vector<Query::dim>& dims, int max_load)
{
    assert(ctable->build_status() == inv_table::builded);
    assert(ctable->shift_bits_sequence == 0);

    int max_count = -1;
    for (unsigned int i = 0; i < queries.size(); ++i)
    {
        assert(queries[i].ref_table() == ctable);
        queries[i].build_compressed(max_load);

        int prev_size = dims.size();
        queries[i].dump(dims);

        int count = dims.size() - prev_size;

        if (count > max_count)
            max_count = count;
    }

    Logger::log(Logger::DEBUG, " dims size: %d.", dims.size());
    Logger::log(Logger::DEBUG, "max_count: %d", max_count);
    return max_count;
}

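// Typical use (a sketch; ctable and queries are assumed to be a built inv_compr_table
// and its queries): dims accumulates the compiled Query::dim entries of all queries,
// and the return value is the largest number of entries any single query contributed:
//
//     std::vector<Query::dim> dims;
//     int num_of_max_count = build_compressed_queries(
//             queries, &ctable, dims, ctable.getUncompressedPostingListMaxLength());
//
// match_integrated() below performs exactly this call before launching the kernel.
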
template <class Codec> __global__ void
match_adaptiveThreshold_integrated(
    int m_size, // number of dimensions, i.e. inv_table::m_size()
    int i_size, // number of instances, i.e. inv_table::i_size() * (1u<<shift_bits_subsequence)
    int hash_table_size, // hash table size
    uint32_t* d_compr_inv, // points to the start of the compressed posting-list array in GPU memory
    Query::dim* d_dims, // compiled queries (dim structure) with ranges into d_compr_inv
    T_HASHTABLE* hash_table_list, // data_t struct (id, aggregation) array of size queries.size() * hash_table_size
    u32* bitmap_list, // of bitmap_size
    int bitmap_bits,
    u32* d_topks, // d_topks set to max_topk for all queries
    u32* d_threshold, // initialized as 1, increased gradually
    u32* d_passCount, // initialized as 0, counts the number of items passing one d_threshold
    u32 num_of_max_count, // number of maximum count per query
    u32* noiih, // number of integers in a hash table; set to 0 for all queries
    bool* overflow,
    unsigned int shift_bits_subsequence)
{
    assert(MATCH_THREADS_PER_BLOCK == blockDim.x);

    assert(m_size != 0 && i_size != 0);

    Query::dim& myb_query = d_dims[blockIdx.x];
    int query_index = myb_query.query;
    u32* my_noiih = &noiih[query_index];
    u32* my_threshold = &d_threshold[query_index];
    u32* my_passCount = &d_passCount[query_index * num_of_max_count];
    u32 my_topk = d_topks[query_index]; // for AT

    T_HASHTABLE* hash_table = &hash_table_list[query_index * hash_table_size];
    u32* bitmap;
    if (bitmap_bits)
        bitmap = &bitmap_list[query_index * (i_size / (32 / bitmap_bits) + 1)];

    assert(myb_query.start_pos < myb_query.end_pos);

    int min = myb_query.start_pos;
    int max = myb_query.end_pos;
    size_t comprLength = max - min;
    int order = myb_query.order;

    Codec codec;
    // Check that Codec is compatible with the current list
    assert(max - min <= codec.decodeArrayParallel_maxBlocks() * codec.decodeArrayParallel_lengthPerBlock());
    assert(max - min <= gridDim.x * blockDim.x * codec.decodeArrayParallel_threadLoad());
    assert(blockDim.x == codec.decodeArrayParallel_lengthPerBlock() / codec.decodeArrayParallel_threadLoad());

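    // One block decodes exactly one compressed list entirely in shared memory; the
    // asserts above check that the list fits within the codec's per-block parallel
    // decode capacity and that the launch configuration matches the codec's contract.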
    __shared__ uint32_t s_comprInv[GPUGENIE_INTEGRATED_KERNEL_SM_SIZE];
    __shared__ uint32_t s_decomprInv[GPUGENIE_INTEGRATED_KERNEL_SM_SIZE];

    int idx = threadIdx.x;
    // Copy the compressed list from global memory into shared memory
    // TODO: change to more coalesced access (each thread accesses a consecutive 128b value)
    for (int i = 0; i < codec.decodeArrayParallel_lengthPerBlock(); i += codec.decodeArrayParallel_threadsPerBlock())
    {
        s_comprInv[idx + i] = (idx + i < (int)comprLength) ? d_compr_inv[idx + i + min] : 0;
        s_decomprInv[idx + i] = 0;
    }
    // Set the uncompressed length to the maximal length; decomprLength also acts as capacity for the codec
    size_t decomprLength = GPUGENIE_INTEGRATED_KERNEL_SM_SIZE;
    __syncthreads();
    codec.decodeArrayParallel(s_comprInv, comprLength, s_decomprInv, decomprLength);
    __syncthreads();
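    // At this point decodeArrayParallel has decoded the list into s_decomprInv and
    // updated decomprLength (passed in above as the capacity) to the actual number of
    // decompressed elements, as the assert below relies on.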

    // if (idx == 0)
    //     printf("Block %d, query %d, start_pos %d, end_pos %d, comprLength %d, decomprLength %d,\n compr values [0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x],\n decompr values [%d,%d,%d,%d,%d,%d,%d,%d,%d,%d] \n",
    //         blockIdx.x, query_index, min, max, (int)comprLength, (int)decomprLength,
    //         s_comprInv[0], s_comprInv[1], s_comprInv[2], s_comprInv[3], s_comprInv[4],
    //         s_comprInv[5], s_comprInv[6], s_comprInv[7], s_comprInv[8], s_comprInv[9],
    //         s_decomprInv[0], s_decomprInv[1], s_decomprInv[2], s_decomprInv[3], s_decomprInv[4],
    //         s_decomprInv[5], s_decomprInv[6], s_decomprInv[7], s_decomprInv[8], s_decomprInv[9]);

    assert(decomprLength != 0);

    // Iterate over the decompressed posting-list array s_decomprInv in chunks of
    // MATCH_THREADS_PER_BLOCK docIDs, where each thread processes one docID at a time
    for (int i = 0; i < ((int)decomprLength - 1) / MATCH_THREADS_PER_BLOCK + 1; ++i)
    {
        if (idx + i * MATCH_THREADS_PER_BLOCK < (int)decomprLength)
        {
            u32 count = 0; // for AT
            u32 access_id = s_decomprInv[idx + i * MATCH_THREADS_PER_BLOCK]; // docID retrieved from the posting-list array

            if (shift_bits_subsequence != 0)
            {
                int __offset = access_id & (((unsigned int)1<<shift_bits_subsequence) - 1);
                int __new_offset = __offset - order;
                if (__new_offset >= 0)
                {
                    access_id = access_id - __offset + __new_offset;
                }
                else
                    continue;
            }

            u32 thread_threshold = *my_threshold;
            bool key_eligible;
            if (bitmap_bits)
            {
                key_eligible = false;
                // All counts are stored in the bitmap; fetch the count for this docID
                count = bitmap_kernel_AT(access_id, bitmap, bitmap_bits,
                        thread_threshold, &key_eligible);

                if (!key_eligible)
                    continue; // i.e. count < thread_threshold
            }

            key_eligible = false;
            if (count < *my_threshold)
            {
                continue; // threshold has been increased, no need to insert
            }

            // Try to find the entry in the hash table
            bool pass_threshold; // whether the check against my_threshold passed
            access_kernel_AT(
                    access_id,
                    hash_table, hash_table_size, myb_query, count, &key_eligible,
                    my_threshold, &pass_threshold);

            if (key_eligible)
            {
                if (pass_threshold)
                {
                    updateThreshold(my_passCount, my_threshold, my_topk, count);
                }

                continue;
            }

            if (!key_eligible)
            {
                // Insert the key into the hash table;
                // access_id and its location are packed into a packed key

                if (count < *my_threshold)
                {
                    continue; // threshold has been increased, no need to insert
                }

                hash_kernel_AT(access_id, hash_table, hash_table_size, myb_query, count,
                        my_threshold, my_noiih, overflow, &pass_threshold);
                if (*overflow)
                {
                    return;
                }
                if (pass_threshold)
                {
                    updateThreshold(my_passCount, my_threshold, my_topk, count);
                }
            }
        }
    }
}
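
// The kernel above is launched with one thread block per compiled Query::dim entry
// (see match_integrated() below), so blockIdx.x directly indexes d_dims.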

} // namespace matching
} // namespace genie

template <class Codec> void
match_integrated(
    inv_compr_table& table,
    std::vector<Query>& queries,
    thrust::device_vector<data_t>& d_hash_table,
    thrust::device_vector<u32>& d_bitmap,
    int hash_table_size,
    int bitmap_bits,
    thrust::device_vector<u32>& d_num_of_items_in_hashtable,
    thrust::device_vector<u32>& d_threshold,
    thrust::device_vector<u32>& d_passCount)
{
    // GPU time measuring events (kernel executions are measured using CUDA events)
    cudaEvent_t startMatching, stopMatching;
    cudaEvent_t startConvert, stopConvert;
    cudaEventCreate(&startMatching);
    cudaEventCreate(&stopMatching);
    cudaEventCreate(&startConvert);
    cudaEventCreate(&stopConvert);
    // CPU time measuring
    u64 overallStart, overallEnd;
    u64 queryCompilationStart, queryCompilationEnd;
    u64 preprocessingStart, preprocessingEnd;
    u64 queryTransferStart, queryTransferEnd;
    u64 dataTransferStart, dataTransferEnd;
    u64 constantTransferStart, constantTransferEnd;
    u64 allocationStart, allocationEnd;
    u64 fillingStart, fillingEnd;


    Logger::log(Logger::INFO, "*** Starting matching (Integrated Compressed)...");
    overallStart = getTime();

    // Make sure that if we decompress a single list from the table, it fits into shared memory.
    // (The listing dropped the assert on this line; reconstructed from the comment and the
    // GPUGENIE_INTEGRATED_KERNEL_SM_SIZE capacity used by the kernel.)
    assert(table.getUncompressedPostingListMaxLength() <= GPUGENIE_INTEGRATED_KERNEL_SM_SIZE);
    assert(table.build_status() == inv_table::builded);


    Logger::log(Logger::INFO, " Preprocessing variables for matching kernel...");
    preprocessingStart = getTime();

    u32 shift_bits_subsequence = table._shift_bits_subsequence();
    int bitmap_size = getBitmapSize(bitmap_bits, shift_bits_subsequence, table.i_size(), queries.size());
    assert(bitmap_size > 0);
    u32 max_topk = cal_max_topk(queries);

    preprocessingEnd = getTime();


    Logger::log(Logger::INFO, " Compiling queries...");
    queryCompilationStart = getTime();

    vector<Query::dim> dims;
    // number of maximum count per query
    u32 num_of_max_count = build_compressed_queries(
            queries, &table, dims, table.getUncompressedPostingListMaxLength());

    queryCompilationEnd = getTime();


    Logger::log(Logger::INFO, " Transferring queries to device...");
    queryTransferStart = getTime();

    thrust::device_vector<Query::dim> d_dims(dims);

    queryTransferEnd = getTime();


    Logger::log(Logger::INFO, " Transferring inverted lists to device...");
    dataTransferStart = getTime();

    if (table.get_total_num_of_table() > 1 || !table.is_stored_in_gpu)
        table.cpy_data_to_gpu();

    dataTransferEnd = getTime();


    Logger::log(Logger::INFO, " Transferring constant symbol memory to device...");
    constantTransferStart = getTime();

    cudaCheckErrors(cudaMemcpyToSymbol(genie::matching::d_offsets, genie::matching::h_offsets, sizeof(u32)*16, 0,
            cudaMemcpyHostToDevice));
    Logger::log(Logger::INFO, " Transferring offsets table (total %d bytes)", sizeof(u32)*16);

    constantTransferEnd = getTime();


    Logger::log(Logger::INFO, " Allocating matching memory on device...");
    allocationStart = getTime();

    Logger::log(Logger::INFO, " Allocating threshold (total %d bytes)...", queries.size() * sizeof(u32));
    d_threshold.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating passCount (total %d bytes)...", queries.size() * num_of_max_count * sizeof(u32));
    d_passCount.resize(queries.size() * num_of_max_count);

    Logger::log(Logger::INFO, " Allocating bitmap (total %d bytes)...", bitmap_size * sizeof(u32));
    d_bitmap.resize(bitmap_size);

    Logger::log(Logger::INFO, " Allocating num_of_items_in_hashtable (total %d bytes)...", queries.size() * sizeof(u32));
    d_num_of_items_in_hashtable.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating d_topks (total %d bytes)...", queries.size() * sizeof(u32));
    thrust::device_vector<u32> d_topks;
    d_topks.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating hash_table (total %d bytes)...", queries.size() * hash_table_size * sizeof(data_t));
    d_hash_table.resize(queries.size() * hash_table_size);

    bool h_overflow[1] = {false};
    bool* d_overflow_p;
    Logger::log(Logger::INFO, " Allocating hash table overflow indicator (total %d bytes)...", sizeof(bool));
    cudaCheckErrors(cudaMalloc((void**) &d_overflow_p, sizeof(bool)));

    allocationEnd = getTime();


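    // Layout note: these buffers are contiguous per-query slabs, matching the kernel's
    // indexing; e.g. query q owns d_hash_table[q * hash_table_size .. (q+1) * hash_table_size)
    // and d_passCount[q * num_of_max_count .. (q+1) * num_of_max_count).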
    Logger::log(Logger::INFO, " Matching...");
    for (int loop_count = 1; ; loop_count++)
    {
        Logger::log(Logger::INFO, " Preparing matching... (attempt %d)", loop_count);
        Logger::log(Logger::INFO, " Filling matching memory on device...");
        fillingStart = getTime();

        thrust::fill(d_threshold.begin(), d_threshold.end(), 1);
        thrust::fill(d_passCount.begin(), d_passCount.end(), 0u);
        thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
        thrust::fill(d_num_of_items_in_hashtable.begin(), d_num_of_items_in_hashtable.end(), 0u);
        thrust::fill(d_topks.begin(), d_topks.end(), max_topk);
        thrust::fill(d_hash_table.begin(), d_hash_table.end(), data_t{0u, 0.0f});

        h_overflow[0] = false;
        cudaCheckErrors(cudaMemcpy(d_overflow_p, h_overflow, sizeof(bool), cudaMemcpyHostToDevice));

        fillingEnd = getTime();


        Logger::log(Logger::INFO, " Starting decompression & match kernel...");
        cudaEventRecord(startMatching);

        // Call the matching kernel: each BLOCK does the matching of one compiled query
        // (one Query::dim entry), decompressing its inverted list on the fly, so a single
        // invocation covers all dims.size() compiled query dims
        match_adaptiveThreshold_integrated<Codec><<<dims.size(), MATCH_THREADS_PER_BLOCK>>>(
                table.m_size(),
                (table.i_size() * ((unsigned int)1<<shift_bits_subsequence)),
                hash_table_size, // hash table size
                table.deviceCompressedInv(), // points to the start of the compressed posting-list array in GPU memory
                thrust::raw_pointer_cast(d_dims.data()), // compiled queries (dim structure)
                reinterpret_cast<T_HASHTABLE*>(thrust::raw_pointer_cast(d_hash_table.data())),
                thrust::raw_pointer_cast(d_bitmap.data()), // of bitmap_size
                bitmap_bits,
                thrust::raw_pointer_cast(d_topks.data()), // d_topks set to max_topk for all queries
                thrust::raw_pointer_cast(d_threshold.data()),
                thrust::raw_pointer_cast(d_passCount.data()), // initialized as 0, counts the number of items passing one d_threshold
                num_of_max_count, // number of maximum count per query
                thrust::raw_pointer_cast(d_num_of_items_in_hashtable.data()), // number of integers in a hash table; set to 0 for all queries
                d_overflow_p, // bool
                shift_bits_subsequence);

        cudaEventRecord(stopMatching);
        cudaEventSynchronize(stopMatching);
        cudaCheckErrors(cudaDeviceSynchronize());

        Logger::log(Logger::INFO, " Checking for hash table overflow...");
        cudaCheckErrors(cudaMemcpy(h_overflow, d_overflow_p, sizeof(bool), cudaMemcpyDeviceToHost));
        if (!h_overflow[0]) {
            Logger::log(Logger::INFO, " Matching succeeded");
            break;
        }

        Logger::log(Logger::INFO, " Matching failed (hash table overflow)");
        hash_table_size += num_of_max_count * max_topk;
        if (hash_table_size > table.i_size())
            hash_table_size = table.i_size();

        d_hash_table.resize(queries.size() * hash_table_size);
        Logger::log(Logger::INFO, " Resized hash table (now total of %d bytes)",
                queries.size() * hash_table_size * sizeof(data_t));
    }
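    // On overflow, the per-query hash table capacity grows by num_of_max_count * max_topk
    // entries (capped at table.i_size()) and the whole matching pass is retried from scratch.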

    Logger::log(Logger::INFO, " Starting data conversion from hash tables...");
    Logger::log(Logger::INFO, " Starting conversion kernel...");
    cudaEventRecord(startConvert);

    convert_to_data<<<hash_table_size * queries.size() / 1024 + 1, 1024>>>(
            reinterpret_cast<T_HASHTABLE*>(thrust::raw_pointer_cast(d_hash_table.data())),
            (u32)hash_table_size * queries.size());

    cudaEventRecord(stopConvert);
    cudaCheckErrors(cudaEventSynchronize(stopConvert));


    // Only deallocate manually allocated memory; thrust::device_vector is deallocated when it goes out of scope
    Logger::log(Logger::INFO, " Deallocating memory...");
    cudaCheckErrors(cudaFree(d_overflow_p));

    Logger::log(Logger::INFO, " Matching is done!");
    overallEnd = getTime();

    float matchingTime, convertTime;
    cudaEventElapsedTime(&matchingTime, startMatching, stopMatching);
    cudaEventElapsedTime(&convertTime, startConvert, stopConvert);

    // Record run-time information (the listing dropped the receiver of this call chain;
    // a PerfLogger accessor of this shape is assumed here)
    PerfLogger<MatchingPerfData>::Instance().Log()
        .Compr(table.getCompression())
        .OverallTime(getInterval(overallStart, overallEnd))
        .QueryCompilationTime(getInterval(queryCompilationStart, queryCompilationEnd))
        .PreprocessingTime(getInterval(preprocessingStart, preprocessingEnd))
        .QueryTransferTime(getInterval(queryTransferStart, queryTransferEnd))
        .DataTransferTime(getInterval(dataTransferStart, dataTransferEnd))
        .ConstantTransferTime(getInterval(constantTransferStart, constantTransferEnd))
        .AllocationTime(getInterval(allocationStart, allocationEnd))
        .FillingTime(getInterval(fillingStart, fillingEnd))
        .MatchingTime(matchingTime)
        .ConvertTime(convertTime)
        .InvSize(sizeof(uint32_t) * table.compressedInv()->size())
        .DimsSize(dims.size() * sizeof(Query::dim))
        .HashTableCapacityPerQuery(hash_table_size)
        .ThresholdSize(queries.size() * sizeof(u32))
        .PasscountSize(queries.size() * num_of_max_count * sizeof(u32))
        .BitmapSize(bitmap_size * sizeof(u32))
        .NumItemsInHashTableSize(queries.size() * sizeof(u32))
        .TopksSize(queries.size() * sizeof(u32))
        .HashTableSize(queries.size() * hash_table_size * sizeof(data_t));
}
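
// Sketch of a typical invocation (hypothetical caller; variable names assumed). The
// codec template argument must correspond to the compression with which the table was
// built, since the kernel decodes table.deviceCompressedInv() with it:
//
//     thrust::device_vector<data_t> d_hash_table;
//     thrust::device_vector<u32> d_bitmap, d_noiih, d_threshold, d_passCount;
//     match_integrated<DeviceDeltaCodec>(ctable, queries, d_hash_table, d_bitmap,
//             hash_table_size, bitmap_bits, d_noiih, d_threshold, d_passCount);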