GENIE
match.cu
Go to the documentation of this file.
1 
5 #include <algorithm>
6 #include <iomanip>
7 #include <iostream>
8 #include <stdlib.h>
9 #include <string>
10 #include <sstream>
11 #include <math.h>
12 #include <algorithm>
13 
14 #include <thrust/copy.h>
15 #include <thrust/device_vector.h>
16 
18 #include <genie/utility/Logger.h>
20 #include <genie/utility/Timing.h>
24 
25 #include "match.h"
26 #include "match_common.h"
27 #include "match_device_utils.h"
28 
29 using namespace genie::matching;
30 using namespace genie::query;
31 using namespace genie::table;
32 using namespace genie::utility;
33 using namespace std;
34 using namespace thrust;
35 
36 namespace genie
37 {
38 namespace matching
39 {
40 
41 
//for AT: adaptive-threshold match kernel.
//
// Launch layout: one block per Query::dim (gridDim.x == number of dims),
// MATCH_THREADS_PER_BLOCK threads per block; each block scans its dim's
// posting-list slice [q.start_pos, q.end_pos) of d_inv.
//
// Parameters:
//   m_size, i_size         - table sizes; kernel is a no-op if either is 0
//   hash_table_size        - per-query hash-table capacity
//   d_inv                  - flattened inverted (posting) lists
//   d_dims                 - one Query::dim per block
//   hash_table_list        - queries * hash_table_size packed hash tables
//   bitmap_list            - per-query count bitmaps (unused when bitmap_bits == 0)
//   bitmap_bits            - bits per counter in the bitmap; 0 disables it
//   d_topks                - per-query topk (consumed by updateThreshold)
//   d_threshold            - per-query adaptive threshold, initialized as 1,
//                            increased gradually as candidates accumulate
//   d_passCount            - per-query counters of items passing each threshold,
//                            initialized as 0; stride num_of_max_count per query
//   noiih                  - per-query number of items in hash table
//   overflow               - set by hash_kernel_AT on overflow; kernel aborts
//                            so the host can re-launch with a larger table
//   shift_bits_subsequence - subsequence offset encoding width (0 if unused)
__global__
void match_AT(int m_size, int i_size, int hash_table_size,
              int* d_inv, Query::dim* d_dims,
              T_HASHTABLE* hash_table_list, u32* bitmap_list, int bitmap_bits,
              u32* d_topks, u32* d_threshold, //initialized as 1, and increase gradually
              u32* d_passCount, //initialized as 0, count the number of items passing one d_threshold
              u32 num_of_max_count, u32* noiih, bool* overflow, unsigned int shift_bits_subsequence)
{
    if (m_size == 0 || i_size == 0)
        return;

    // Each block processes exactly one query dimension.
    Query::dim& q = d_dims[blockIdx.x];
    int query_index = q.query;
    u32* my_noiih = &noiih[query_index];
    u32* my_threshold = &d_threshold[query_index];
    u32* my_passCount = &d_passCount[query_index * num_of_max_count];
    u32 my_topk = d_topks[query_index];

    T_HASHTABLE* hash_table = &hash_table_list[query_index * hash_table_size];
    u32* bitmap;
    if (bitmap_bits)
        bitmap = &bitmap_list[query_index * (i_size / (32 / bitmap_bits) + 1)];

    if (q.start_pos >= q.end_pos)
        return; // empty posting-list slice for this dim

    int min = q.start_pos;
    int max = q.end_pos;
    int order = q.order;
    u32 access_id;
    bool key_eligible;
    bool pass_threshold; // whether the candidate passes the *my_threshold check

    // Tile-strided scan of the posting-list slice.
    for (int i = 0; i < (max - min - 1) / MATCH_THREADS_PER_BLOCK + 1; ++i)
    {
        int tmp_id = threadIdx.x + i * MATCH_THREADS_PER_BLOCK + min;
        if (tmp_id < max)
        {
            u32 count = 0;
            access_id = d_inv[tmp_id];

            // For subsequence matching, re-base the in-sequence offset by this
            // dim's order; discard candidates whose offset would underflow.
            if (shift_bits_subsequence != 0)
            {
                int __offset = access_id & (((unsigned int)1 << shift_bits_subsequence) - 1);
                int __new_offset = __offset - order;
                if (__new_offset >= 0)
                {
                    access_id = access_id - __offset + __new_offset;
                }
                else
                    continue;
            }

            u32 thread_threshold = *my_threshold;
            assert(thread_threshold < gridDim.x);

            if (bitmap_bits)
            {
                key_eligible = false;
                // All counts are stored in the bitmap; fetch and pre-filter.
                count = bitmap_kernel_AT(access_id, bitmap, bitmap_bits,
                                         thread_threshold, &key_eligible);
                if (!key_eligible)
                    continue; // i.e. count < thread_threshold
            }

            key_eligible = false;
            if (count < *my_threshold)
            {
                continue; // threshold has been increased, no need to insert
            }

            // Try to find the entry in the hash table.
            // NOTE(review): this call head was dropped by the doc extraction;
            // restored from the documented access_kernel_AT signature.
            access_kernel_AT(
                access_id,
                hash_table, hash_table_size, q, count, &key_eligible,
                my_threshold, &pass_threshold);

            if (key_eligible)
            {
                if (pass_threshold)
                {
                    updateThreshold(my_passCount, my_threshold, my_topk, count);
                }
                continue;
            }

            // Key not found: insert it. access_id and its location are packed
            // into a single packed hash-table key.
            if (count < *my_threshold)
            {
                continue; // threshold has been increased, no need to insert
            }

            hash_kernel_AT(access_id, hash_table, hash_table_size, q, count,
                           my_threshold, my_noiih, overflow, &pass_threshold);
            if (*overflow)
            {
                return; // host grows the hash table and re-launches
            }
            if (pass_threshold)
            {
                updateThreshold(my_passCount, my_threshold, my_topk, count);
            }
        }
    }
}
//end for AT
160 
/**
 * Build every query against the given table and dump each query's
 * Query::dim entries into dims.
 *
 * Depending on the table/query configuration this calls build_sequence(),
 * build_and_apply_load_balance(max_load) or build() on each query before
 * dumping its dims.
 *
 * @return the maximum number of dims contributed by any single query
 *         (-1 if queries is empty).
 * @throws genie::exception::cpu_runtime_error if a query was constructed
 *         for a different table, or on any other std::exception.
 * @throws genie::exception::cpu_bad_alloc on allocation failure.
 */
int build_queries(vector<Query>& queries, inv_table& table,
                  vector<Query::dim>& dims, int max_load)
{
    try
    {
        u64 query_build_start, query_build_stop;
        query_build_start = getTime();

        int max_count = -1;
        for (unsigned int i = 0; i < queries.size(); ++i)
        {
            if (queries[i].ref_table() != &table)
                throw genie::exception::cpu_runtime_error("Can't build queries. Queries constructed for different table!");
            if (table.build_status() == inv_table::builded)
            {
                if (table.shift_bits_sequence != 0)
                {
                    // For sequence search; load balancing has not been done.
                    queries[i].build_sequence();
                }
                else if (queries[i].use_load_balance)
                {
                    queries[i].build_and_apply_load_balance(max_load);
                }
                else
                {
                    queries[i].build();
                }
            }

            int prev_size = dims.size();
            queries[i].dump(dims);

            // dims contributed by this query alone
            int count = dims.size() - prev_size;
            if (count > max_count)
                max_count = count;
        }

        query_build_stop = getTime();
        Logger::log(Logger::INFO, ">>>>[time profiling]: match: build_queries function takes %f ms. ",
                getInterval(query_build_start, query_build_stop));
        Logger::log(Logger::DEBUG, " dims size: %d.", dims.size());

        return max_count;
    }
    catch (std::bad_alloc &e)
    {
        throw genie::exception::cpu_bad_alloc(e.what());
    }
    // NOTE(review): this catch clause was dropped by the doc extraction;
    // restored so already-wrapped GENIE errors are rethrown unchanged
    // (matches the pattern in build_queries_direct) -- confirm against repo.
    catch (genie::exception::cpu_runtime_error &e)
    {
        throw e;
    }
    catch (std::exception &e)
    {
        throw genie::exception::cpu_runtime_error(e.what());
    }
}
211 
/* Return the largest topk() among all queries (0 if queries is empty). */
int cal_max_topk(vector<Query>& queries)
{
    int max_topk = 0;
    for (size_t i = 0; i < queries.size(); ++i)
    {
        max_topk = std::max(max_topk, queries[i].topk());
    }
    return max_topk;
}
221 
222 
223 
/**
 * Single-table match entry point (adaptive-threshold version).
 *
 * Builds the queries, prepares the pre-filter bitmap and per-query hash
 * tables on the GPU, and launches match_AT, re-launching with a larger hash
 * table whenever the kernel reports overflow. Raw hash-table entries are
 * converted in place via convert_to_data and left in d_data.
 *
 * @param bitmap_bits  counter width for the pre-filter bitmap; a negative
 *                     value also selects adaptive threshold (its absolute
 *                     value is used as the counter width).
 * @throws genie::exception::cpu_runtime_error if the table is not built.
 * @throws genie::exception::gpu_bad_alloc on allocation failure.
 */
void match(inv_table& table, vector<Query>& queries,
        device_vector<data_t>& d_data, device_vector<u32>& d_bitmap,
        int hash_table_size, int max_load, int bitmap_bits,
        device_vector<u32>& d_noiih, device_vector<u32>& d_threshold, device_vector<u32>& d_passCount)
{
    try
    {
        u32 shift_bits_subsequence = table._shift_bits_subsequence();

        if (table.build_status() == inv_table::not_builded)
            throw genie::exception::cpu_runtime_error("table not built!");

        // Time measuring events
        cudaEvent_t kernel_start, kernel_stop;
        cudaEventCreate(&kernel_start);
        cudaEventCreate(&kernel_stop);
        u64 match_stop, match_start;
        match_start = getTime();

        Logger::log(Logger::INFO, "[ 0%] Starting matching...");

        u32 num_of_max_count = 0, max_topk = 0;
        u32 loop_count = 1u;
        d_noiih.resize(queries.size(), 0);
        u32* d_noiih_p = thrust::raw_pointer_cast(d_noiih.data());

        vector<Query::dim> dims;

        Logger::log(Logger::DEBUG, "hash table size: %d.", hash_table_size);
        u64 match_query_start, match_query_end;
        match_query_start = getTime();
        num_of_max_count = build_queries(queries, table, dims, max_load);
        match_query_end = getTime();
        Logger::log(Logger::INFO,
                ">>>>[time profiling]: match: build_queries function takes %f ms. ",
                getInterval(match_query_start, match_query_end));
        Logger::log(Logger::DEBUG, " dims size: %d.", dims.size());

        // for AT: negative bitmap_bits enables adaptive threshold; its
        // magnitude is the bitmap counter width. hash_table_size is still
        // determined by the user.
        if (bitmap_bits < 0)
        {
            bitmap_bits = -bitmap_bits;
        }

        Logger::log(Logger::DEBUG, "[info] bitmap_bits:%d.", bitmap_bits);

        // Round bitmap_bits up to a power of two (two rounding passes, as in
        // the original), then size the bitmap: one counter per (possibly
        // shifted) item per query.
        int threshold = bitmap_bits - 1, bitmap_size = 0;
        if (bitmap_bits > 1)
        {
            float logresult = std::log2((float) bitmap_bits);
            bitmap_bits = (int) logresult;
            if (logresult - bitmap_bits > 0)
            {
                bitmap_bits += 1;
            }
            logresult = std::log2((float) bitmap_bits);
            bitmap_bits = (int) logresult;
            if (logresult - bitmap_bits > 0)
            {
                bitmap_bits += 1;
            }
            bitmap_bits = pow(2, bitmap_bits);
            bitmap_size = ((((unsigned int)1 << shift_bits_subsequence) * table.i_size()) / (32 / bitmap_bits) + 1)
                    * queries.size();
        }
        else
        {
            bitmap_bits = threshold = 0;
        }

        Logger::log(Logger::DEBUG, "Bitmap bits: %d, threshold:%d.", bitmap_bits, threshold);
        Logger::log(Logger::INFO, "[ 20%] Declaring device memory...");

        d_bitmap.resize(bitmap_size);

        cout << "query_transfer time = ";
        u64 query_start = getTime();
        device_vector<Query::dim> d_dims(dims);
        Query::dim* d_dims_p = raw_pointer_cast(d_dims.data());
        u64 query_end = getTime();
        cout << getInterval(query_start, query_end) << "ms." << endl;

        u64 dataTransferStart, dataTransferEnd;
        dataTransferStart = getTime();
        if (table.get_total_num_of_table() > 1 || !table.is_stored_in_gpu)
        {
            table.cpy_data_to_gpu();
        }
        dataTransferEnd = getTime();

        if (bitmap_size)
        {
            thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
        }
        u32* d_bitmap_p = raw_pointer_cast(d_bitmap.data());

        Logger::log(Logger::INFO, "[ 30%] Allocating device memory to tables...");

        data_t nulldata;
        nulldata.id = 0u;
        nulldata.aggregation = 0.0f;
        T_HASHTABLE* d_hash_table;
        data_t* d_data_table;
        d_data.clear();
        // The hash tables live inside d_data, reinterpreted as T_HASHTABLE.
        d_data.resize(queries.size() * hash_table_size, nulldata);
        d_data_table = thrust::raw_pointer_cast(d_data.data());
        d_hash_table = reinterpret_cast<T_HASHTABLE*>(d_data_table);

        Logger::log(Logger::INFO, "[ 33%] Copying memory to symbol...");
        cudaCheckErrors(cudaMemcpyToSymbol(d_offsets, h_offsets, sizeof(u32)*16, 0, cudaMemcpyHostToDevice));

        Logger::log(Logger::INFO, "[ 40%] Starting match kernels...");
        cudaEventRecord(kernel_start);

        bool h_overflow[1] = { false };
        bool* d_overflow;
        cudaCheckErrors(cudaMalloc((void**) &d_overflow, sizeof(bool)));

        // Launch match_AT; on hash-table overflow, grow the table and retry.
        do
        {
            h_overflow[0] = false;
            cudaCheckErrors(cudaMemcpy(d_overflow, h_overflow, sizeof(bool), cudaMemcpyHostToDevice));
            d_threshold.resize(queries.size());
            thrust::fill(d_threshold.begin(), d_threshold.end(), 1);
            u32* d_threshold_p = thrust::raw_pointer_cast(d_threshold.data());

            d_passCount.resize(queries.size() * num_of_max_count);
            thrust::fill(d_passCount.begin(), d_passCount.end(), 0u);
            u32* d_passCount_p = thrust::raw_pointer_cast(d_passCount.data());
            max_topk = cal_max_topk(queries);
            device_vector<u32> d_topks;
            d_topks.resize(queries.size());
            thrust::fill(d_topks.begin(), d_topks.end(), max_topk);
            u32* d_topks_p = thrust::raw_pointer_cast(d_topks.data());

            match_AT<<<dims.size(), MATCH_THREADS_PER_BLOCK>>>
                (table.m_size(),
                 table.i_size() * ((unsigned int)1 << shift_bits_subsequence),
                 hash_table_size,
                 table.d_inv_p,
                 d_dims_p,
                 d_hash_table,
                 d_bitmap_p,
                 bitmap_bits,
                 d_topks_p,
                 d_threshold_p,    // initialized as 1, and increased gradually
                 d_passCount_p,    // initialized as 0, counts items passing one d_threshold
                 num_of_max_count, // number of maximum count per query
                 d_noiih_p,
                 d_overflow,
                 shift_bits_subsequence);
            cudaCheckErrors(cudaDeviceSynchronize());
            cudaCheckErrors(cudaMemcpy(h_overflow, d_overflow, sizeof(bool), cudaMemcpyDeviceToHost));

            if (h_overflow[0])
            {
                // Grow the hash table (capped at table.i_size()) and reset all
                // per-query state before retrying.
                hash_table_size += num_of_max_count * max_topk;
                if (hash_table_size > table.i_size())
                {
                    hash_table_size = table.i_size();
                }
                thrust::fill(d_noiih.begin(), d_noiih.end(), 0u);
                if (bitmap_size)
                {
                    thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
                }
                d_data.resize(queries.size() * hash_table_size);
                thrust::fill(d_data.begin(), d_data.end(), nulldata);
                d_data_table = thrust::raw_pointer_cast(d_data.data());
                d_hash_table = reinterpret_cast<T_HASHTABLE*>(d_data_table);
            }

            if (loop_count > 1 || (loop_count == 1 && h_overflow[0]))
            {
                Logger::log(Logger::INFO, "%d time trying to launch match kernel: %s!", loop_count, h_overflow[0] ? "failed" : "succeeded");
            }
            loop_count++;
        } while (h_overflow[0]);

        cudaCheckErrors(cudaFree(d_overflow));

        cudaEventRecord(kernel_stop);
        Logger::log(Logger::INFO, "[ 90%] Starting data converting......");

        convert_to_data<<<hash_table_size * queries.size() / 1024 + 1, 1024>>>(d_hash_table, (u32)hash_table_size * queries.size());

        Logger::log(Logger::INFO, "[100%] Matching is done!");

        match_stop = getTime();

        cudaEventSynchronize(kernel_stop);
        float kernel_elapsed = 0.0f;
        cudaEventElapsedTime(&kernel_elapsed, kernel_start, kernel_stop);
        Logger::log(Logger::INFO,
                ">>>>[time profiling]: Match kernel takes %f ms. (GPU running) ",
                kernel_elapsed);
        Logger::log(Logger::INFO,
                ">>>>[time profiling]: Match function takes %f ms. (including Match kernel, GPU+CPU part)",
                getInterval(match_start, match_stop));
        Logger::log(Logger::VERBOSE, ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");

        // NOTE(review): the head of this fluent chain was dropped by the doc
        // extraction; presumably the matching-perf-data logger singleton as
        // restored below -- confirm against the repository source.
        genie::utility::PerfLogger<genie::utility::MatchingPerfData>::Instance().Log()
                .OverallTime(getInterval(match_start, match_stop))
                .QueryCompilationTime(getInterval(match_query_start, match_query_end))
                .QueryTransferTime(getInterval(query_start, query_end))
                .DataTransferTime(getInterval(dataTransferStart, dataTransferEnd))
                .MatchingTime(kernel_elapsed)
                .InvSize(sizeof(int) * table.inv()->size())
                .DimsSize(dims.size() * sizeof(Query::dim))
                .HashTableCapacityPerQuery(hash_table_size)
                .ThresholdSize(queries.size() * sizeof(u32))
                .PasscountSize(queries.size() * num_of_max_count * sizeof(u32))
                .BitmapSize(bitmap_size * sizeof(u32))
                .NumItemsInHashTableSize(queries.size() * sizeof(u32))
                .TopksSize(queries.size() * sizeof(u32))
                .HashTableSize(queries.size() * hash_table_size * sizeof(data_t));
    }
    catch (std::bad_alloc &e)
    {
        throw genie::exception::gpu_bad_alloc(e.what());
    }
}
465 
466 
// debug use: build each query directly via the overloaded build(dims)
// (no sequence handling, no load balancing) and collect its dims.
// Returns the maximum number of dims contributed by any single query
// (-1 if queries is empty).
static int build_queries_direct(vector<genie::query::Query> &queries, genie::table::inv_table &table, vector<genie::query::Query::dim> &dims)
{
    try
    {
        int max_count = -1;
        for (size_t i = 0; i < queries.size(); ++i)
        {
            if (queries[i].ref_table() != &table)
                throw genie::exception::cpu_runtime_error("table not built");
            int prev_size = dims.size();
            queries[i].build(dims); // overloaded: dumps dims directly
            int count = dims.size() - prev_size;
            if (count > max_count)
                max_count = count;
        }

        return max_count;
    }
    catch (std::bad_alloc &e)
    {
        throw genie::exception::cpu_bad_alloc(e.what());
    }
    // NOTE(review): this catch clause was dropped by the doc extraction;
    // restored so the cpu_runtime_error thrown above is rethrown unchanged
    // instead of being re-wrapped by the std::exception handler -- confirm.
    catch (genie::exception::cpu_runtime_error &e)
    {
        throw e;
    }
    catch (std::exception &e)
    {
        throw genie::exception::cpu_runtime_error(e.what());
    }
}
500 
/*
 * Multi-table variant of match(): processes tables in [start, finish),
 * preparing per-table bitmaps, hash tables and thresholds, then launching
 * one match_AT kernel per non-empty query set. Results are left in the
 * corresponding d_data entries (converted via convert_to_data).
 *
 * Throws genie::exception::cpu_runtime_error if any touched table is not
 * built, and genie::exception::gpu_bad_alloc on allocation failure.
 */
void
match_MT(vector<inv_table*>& table, vector<vector<Query> >& queries,
        vector<device_vector<data_t> >& d_data, vector<device_vector<u32> >& d_bitmap,
        vector<int>& hash_table_size, vector<int>& max_load, int bitmap_bits,
        vector<device_vector<u32> >& d_noiih, vector<device_vector<u32> >& d_threshold,
        vector<device_vector<u32> >& d_passCount, size_t start, size_t finish)
{
    try
    {
        /* timing */
        u64 match_stop, match_start;
        cudaEvent_t kernel_start, kernel_stop;
        float kernel_elapsed;
        cudaEventCreate(&kernel_start);
        cudaEventCreate(&kernel_stop);
        match_start = getTime();
        Logger::log(Logger::INFO, "[ 0%] Starting matching...");

        /* per-table state (indexed by table position) */
        u32 shift_bits_subsequence = table.at(0)->_shift_bits_subsequence();
        vector<vector<Query::dim> > dims(table.size());          /* Query::dim on CPU */
        vector<device_vector<Query::dim> > d_dims(table.size()); /* Query::dim on GPU */
        vector<Query::dim*> d_dims_p(table.size());
        vector<u32*> d_noiih_p(table.size());
        vector<u32> num_of_max_count(table.size(), 0);
        vector<u32> max_topk(table.size(), 0);
        vector<u32*> d_bitmap_p(table.size());
        vector<bool*> d_overflow(table.size());
        vector<T_HASHTABLE*> d_hash_table(table.size());
        vector<u32*> d_threshold_p(table.size());
        vector<u32*> d_passCount_p(table.size());
        vector<device_vector<u32> > d_topks(table.size());
        vector<u32*> d_topks_p(table.size());
        vector<int> threshold(table.size(), bitmap_bits - 1);
        vector<int> bitmap_size(table.size(), 0);
        data_t nulldata = {0u, 0.0f};
        data_t* d_data_table;

        /* adaptive threshold: magnitude of a negative bitmap_bits is the
         * counter width (hash_table_size stays user-determined) */
        if (bitmap_bits < 0)
            bitmap_bits = -bitmap_bits;
        Logger::log(Logger::DEBUG,
                "[info] bitmap_bits:%d.",
                bitmap_bits);
        int bitmap_bits_copy = bitmap_bits;

        /* table-dependent pre-processing */
        for (size_t i = start; i < finish; ++i)
        {
            if (queries.at(i).empty())
                continue;
            if (table.at(i)->build_status() == inv_table::not_builded)
                throw genie::exception::cpu_runtime_error("table not built!");

            /* bitmap: round the counter width up to a power of two, then size
             * one counter per (possibly shifted) item per query */
            bitmap_bits = bitmap_bits_copy;
            if (bitmap_bits > 1)
            {
                float lg = std::log2((float) bitmap_bits);
                bitmap_bits = (int) lg;
                if (lg - bitmap_bits > 0)
                    bitmap_bits += 1;
                lg = std::log2((float) bitmap_bits);
                bitmap_bits = (int) lg;
                if (lg - bitmap_bits > 0)
                    bitmap_bits += 1;
                bitmap_bits = pow(2, bitmap_bits);
                bitmap_size[i] = ((((unsigned int)1 << shift_bits_subsequence) * table.at(i)->i_size()) / (32 / bitmap_bits) + 1)
                        * queries.at(i).size();
            }
            else
            {
                threshold[i] = 0;
                bitmap_bits = 0;
            }

            Logger::log(Logger::DEBUG, "[ 20%] Declaring device memory...");
            d_bitmap.at(i).resize(bitmap_size.at(i));
            d_bitmap_p[i] = thrust::raw_pointer_cast(d_bitmap.at(i).data());

            /* number of items in hash table */
            d_noiih.at(i).resize(queries.at(i).size(), 0u);
            d_noiih_p[i] = thrust::raw_pointer_cast(d_noiih.at(i).data());
            Logger::log(Logger::DEBUG, "hash table size: %d.", hash_table_size.at(i));

            /* build queries for this table */
            u64 match_query_start, match_query_end;
            match_query_start = getTime();
            num_of_max_count[i] = build_queries_direct(queries.at(i), table.at(i)[0], dims.at(i));
            match_query_end = getTime();
            Logger::log(Logger::DEBUG,
                    ">>>>[time profiling]: match: build_queries function takes %f ms. ",
                    getInterval(match_query_start, match_query_end));
            Logger::log(Logger::DEBUG, " dims size: %d.",
                    dims.at(i).size());

            /* transfer queries to the GPU (timestamps kept from original
             * instrumentation; the log line itself was commented out) */
            u64 query_start = getTime();
            d_dims[i] = dims.at(i);
            d_dims_p[i] = raw_pointer_cast(d_dims.at(i).data());
            u64 query_end = getTime();

            Logger::log(Logger::DEBUG, "[ 30%] Allocating device memory to tables...");

            /* hash table lives inside d_data, reinterpreted as T_HASHTABLE */
            d_data.at(i).clear();
            d_data.at(i).resize(queries.at(i).size() * hash_table_size.at(i), nulldata);
            d_data_table = thrust::raw_pointer_cast(d_data.at(i).data());
            d_hash_table[i] = reinterpret_cast<T_HASHTABLE*>(d_data_table);

            /* overflow flag */
            bool init_overflow = false;
            cudaCheckErrors(cudaMalloc((void**)&d_overflow[i], sizeof(bool)));
            cudaCheckErrors(cudaMemcpy(d_overflow[i], &init_overflow, sizeof(bool), cudaMemcpyHostToDevice));

            /* adaptive threshold, one per query */
            d_threshold.at(i).resize(queries.at(i).size());
            thrust::fill(d_threshold.at(i).begin(), d_threshold.at(i).end(), 1);
            d_threshold_p[i] = thrust::raw_pointer_cast(d_threshold.at(i).data());

            /* pass-count zipper array */
            d_passCount.at(i).resize(queries.at(i).size() * num_of_max_count.at(i));
            thrust::fill(d_passCount.at(i).begin(), d_passCount.at(i).end(), 0u);
            d_passCount_p[i] = thrust::raw_pointer_cast(d_passCount.at(i).data());

            /* topk */
            max_topk[i] = cal_max_topk(queries.at(i));
            d_topks.at(i).resize(queries.at(i).size());
            thrust::fill(d_topks.at(i).begin(), d_topks.at(i).end(), max_topk[i]);
            d_topks_p[i] = thrust::raw_pointer_cast(d_topks.at(i).data());
        }

        /* offsets used by the hash functions */
        Logger::log(Logger::INFO, "[ 33%] Copying memory to symbol...");
        cudaCheckErrors(cudaMemcpyToSymbol(d_offsets, h_offsets, sizeof(u32)*16, 0, cudaMemcpyHostToDevice));

        /* one match kernel per non-empty query set */
        Logger::log(Logger::INFO, "[ 40%] Starting match kernels...");
        cudaEventRecord(kernel_start);
        for (size_t i = start; i < finish; ++i)
        {
            if (queries.at(i).empty())
                continue;
            match_AT<<<dims.at(i).size(), MATCH_THREADS_PER_BLOCK>>>(
                    table.at(i)->m_size(),
                    table.at(i)->i_size() * ((unsigned int)1 << shift_bits_subsequence),
                    hash_table_size.at(i),
                    table.at(i)->d_inv_p,
                    d_dims_p.at(i),
                    d_hash_table.at(i),
                    d_bitmap_p.at(i),
                    bitmap_bits,
                    d_topks_p.at(i),
                    d_threshold_p.at(i),    /* initialized as 1, increased gradually */
                    d_passCount_p.at(i),    /* initialized as 0, counts items passing one d_threshold */
                    num_of_max_count.at(i), /* number of maximum count per query */
                    d_noiih_p.at(i),
                    d_overflow.at(i),
                    shift_bits_subsequence);
        }
        cudaCheckErrors(cudaDeviceSynchronize());

        /* clean up the device overflow flags */
        for (size_t i = start; i < finish; ++i)
            cudaCheckErrors(cudaFree(d_overflow.at(i)));

        cudaEventRecord(kernel_stop);
        Logger::log(Logger::INFO, "[ 90%] Starting data converting......");

        for (size_t i = start; i < finish; ++i)
            convert_to_data<<<hash_table_size.at(i) * queries.at(i).size() / 1024 + 1, 1024>>>(d_hash_table.at(i), (u32)hash_table_size.at(i) * queries.at(i).size());

        Logger::log(Logger::INFO, "[100%] Matching is done!");

        match_stop = getTime();
        cudaEventSynchronize(kernel_stop);

        kernel_elapsed = 0.0f;
        cudaEventElapsedTime(&kernel_elapsed, kernel_start, kernel_stop);
        Logger::log(Logger::INFO,
                ">>>>[time profiling]: Match kernel takes %f ms. (GPU running) ",
                kernel_elapsed);
        Logger::log(Logger::INFO,
                ">>>>[time profiling]: Match function takes %f ms. (including Match kernel, GPU+CPU part)",
                getInterval(match_start, match_stop));
        Logger::log(Logger::VERBOSE, ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
    } catch(std::bad_alloc &e){
        throw genie::exception::gpu_bad_alloc(e.what());
    }
}
694 
695 }
696 }
__device__ __forceinline__ void access_kernel_AT(u32 id, T_HASHTABLE *htable, int hash_table_size, genie::query::Query::dim &q, u32 count, bool *key_found, u32 *my_threshold, bool *pass_threshold)
const size_t MATCH_THREADS_PER_BLOCK
Definition: match_common.h:41
bool cpy_data_to_gpu()
Copy vector _inv to gpu memory which is referenced by d_inv_p.
Definition: inv_table.cu:30
This is the top-level namespace of the project.
int * d_inv_p
d_inv_p points to the start location for posting list array in GPU memory.
Definition: inv_table.h:55
__device__ __forceinline__ void hash_kernel_AT(u32 id, T_HASHTABLE *htable, int hash_table_size, genie::query::Query::dim &q, u32 count, u32 *my_threshold, u32 *my_noiih, bool *overflow, bool *pass_threshold)
unsigned long long getTime()
Get system time.
Definition: Timing.cc:22
The declaration for class inv_table.
Definition: inv_table.h:41
void match_MT(vector< inv_table *> &table, vector< vector< Query > > &queries, vector< device_vector< data_t > > &d_data, vector< device_vector< u32 > > &d_bitmap, vector< int > &hash_table_size, vector< int > &max_load, int bitmap_bits, vector< device_vector< u32 > > &d_noiih, vector< device_vector< u32 > > &d_threshold, vector< device_vector< u32 > > &d_passCount, size_t start, size_t finish)
Definition: match.cu:502
__global__ void convert_to_data(T_HASHTABLE *table, u32 size)
Definition: match_common.cu:10
int get_total_num_of_table() const
return the total_num_of_table.
Definition: inv_table.cu:225
int cal_max_topk(vector< Query > &queries)
Definition: match.cu:212
int shift_bits_sequence
This variable is used to tell the number of bits shifted for recording gram in different position...
Definition: inv_table.h:75
unsigned int _shift_bits_subsequence()
Definition: inv_table.cu:202
This file includes interfaces of original GENIE match functions.
virtual std::vector< int > * inv()
Definition: inv_table.cu:248
Record run-time information.
int build_queries(vector< Query > &queries, inv_table &table, vector< Query::dim > &dims, int max_load)
Definition: match.cu:161
__device__ __forceinline__ u32 bitmap_kernel_AT(u32 access_id, u32 *bitmap, int bits, int my_threshold, bool *key_eligible)
__global__ void match_AT(int m_size, int i_size, int hash_table_size, int *d_inv, Query::dim *d_dims, T_HASHTABLE *hash_table_list, u32 *bitmap_list, int bitmap_bits, u32 *d_topks, u32 *d_threshold, u32 *d_passCount, u32 num_of_max_count, u32 *noiih, bool *overflow, unsigned int shift_bits_subsequence)
Definition: match.cu:44
uint32_t u32
Definition: match_common.h:18
unsigned long long u64
A type definition for a 64-bit unsigned integer.
Definition: match_common.h:19
bool is_stored_in_gpu
is_stored_in_gpu tells whether the inverted index structure is pre-stored inside GPU memory.
Definition: inv_table.h:70
The second-step struct for processing queries.
Definition: query.h:59
double getInterval(unsigned long long start, unsigned long long stop)
Calculate time interval from start to end.
Definition: Timing.cc:36
Functions about getting system time.
Basic utility functions to be used in matching kernels.
u64 T_HASHTABLE
Definition: match_common.h:21
void match(inv_table &table, vector< Query > &queries, device_vector< data_t > &d_data, device_vector< u32 > &d_bitmap, int hash_table_size, int max_load, int bitmap_bits, device_vector< u32 > &d_noiih, device_vector< u32 > &d_threshold, device_vector< u32 > &d_passCount)
Definition: match.cu:224
__device__ __forceinline__ void updateThreshold(u32 *my_passCount, u32 *my_threshold, u32 my_topk, u32 count)
#define cudaCheckErrors(err)
The wrapper function to validate CUDA calls.
Definition: cuda_macros.h:23