14 #include <thrust/copy.h> 15 #include <thrust/device_vector.h> 44 void match_AT(
int m_size,
int i_size,
int hash_table_size,
47 u32* d_topks,
u32* d_threshold,
49 u32 num_of_max_count,
u32 * noiih,
bool * overflow,
unsigned int shift_bits_subsequence)
51 if (m_size == 0 || i_size == 0)
54 int query_index = q.
query;
55 u32* my_noiih = &noiih[query_index];
56 u32* my_threshold = &d_threshold[query_index];
57 u32* my_passCount = &d_passCount[query_index * num_of_max_count];
58 u32 my_topk = d_topks[query_index];
60 T_HASHTABLE* hash_table = &hash_table_list[query_index * hash_table_size];
63 bitmap = &bitmap_list[query_index * (i_size / (32 / bitmap_bits) + 1)];
82 access_id = d_inv[tmp_id];
84 if(shift_bits_subsequence != 0)
86 int __offset = access_id & (((
unsigned int)1<<shift_bits_subsequence) - 1);
87 int __new_offset = __offset - order;
90 access_id = access_id - __offset + __new_offset;
96 u32 thread_threshold = *my_threshold;
97 assert(thread_threshold < gridDim.x);
102 key_eligible =
false;
105 thread_threshold, &key_eligible);
111 key_eligible =
false;
112 if (count < *my_threshold)
120 hash_table, hash_table_size, q, count, &key_eligible,
121 my_threshold, &pass_threshold);
138 if (count < *my_threshold)
144 my_threshold, my_noiih, overflow, &pass_threshold);
162 vector<Query::dim>& dims,
int max_load)
165 u64 query_build_start, query_build_stop;
169 for (
unsigned int i = 0; i < queries.size(); ++i)
171 if (queries[i].ref_table() != &table)
177 queries[i].build_sequence();
178 }
else if (queries[i].use_load_balance)
180 queries[i].build_and_apply_load_balance(max_load);
187 int prev_size = dims.size();
188 queries[i].dump(dims);
190 int count = dims.size() - prev_size;
192 if(count > max_count) max_count = count;
196 Logger::log(Logger::INFO,
">>>>[time profiling]: match: build_queries function takes %f ms. ",
199 Logger::log(Logger::DEBUG,
" dims size: %d.", dims.size());
203 }
catch(std::bad_alloc &e){
207 }
catch(std::exception &e){
215 for(vector<Query>::iterator it = queries.begin(); it != queries.end(); ++it)
217 if(it->topk() > max_topk) max_topk = it->topk();
225 device_vector<data_t>& d_data, device_vector<u32>& d_bitmap,
226 int hash_table_size,
int max_load,
int bitmap_bits,
227 device_vector<u32>& d_noiih, device_vector<u32>& d_threshold, device_vector<u32>& d_passCount)
236 cudaEvent_t kernel_start, kernel_stop;
237 cudaEventCreate(&kernel_start);
238 cudaEventCreate(&kernel_stop);
239 u64 match_stop, match_start;
242 Logger::log(Logger::INFO,
"[ 0%] Starting matching...");
244 u32 num_of_max_count=0, max_topk=0;
246 d_noiih.resize(queries.size(), 0);
247 u32 * d_noiih_p = thrust::raw_pointer_cast(d_noiih.data());
249 vector<Query::dim> dims;
251 Logger::log(Logger::DEBUG,
"hash table size: %d.", hash_table_size);
252 u64 match_query_start, match_query_end;
254 num_of_max_count =
build_queries(queries, table, dims, max_load);
257 Logger::log(Logger::INFO,
258 ">>>>[time profiling]: match: build_queries function takes %f ms. ",
260 Logger::log(Logger::DEBUG,
" dims size: %d.",
266 bitmap_bits = -bitmap_bits;
270 Logger::log(Logger::DEBUG,
271 "[info] bitmap_bits:%d.",
276 int threshold = bitmap_bits - 1, bitmap_size = 0;
279 float logresult = std::log2((
float) bitmap_bits);
280 bitmap_bits = (int) logresult;
281 if (logresult - bitmap_bits > 0)
285 logresult = std::log2((
float) bitmap_bits);
286 bitmap_bits = (int) logresult;
287 if (logresult - bitmap_bits > 0)
291 bitmap_bits = pow(2, bitmap_bits);
292 bitmap_size = ((((
unsigned int)1<<shift_bits_subsequence) * table.
i_size()) / (32 / bitmap_bits) + 1)
297 bitmap_bits = threshold = 0;
300 Logger::log(Logger::DEBUG,
"Bitmap bits: %d, threshold:%d.", bitmap_bits,
302 Logger::log(Logger::INFO,
"[ 20%] Declaring device memory...");
304 d_bitmap.resize(bitmap_size);
307 cout <<
"query_transfer time = " ;
310 device_vector<Query::dim> d_dims(dims);
311 Query::dim* d_dims_p = raw_pointer_cast(d_dims.data());
314 cout <<
getInterval(query_start, query_end) <<
"ms." << endl;
316 u64 dataTransferStart, dataTransferEnd;
326 thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
328 u32 * d_bitmap_p = raw_pointer_cast(d_bitmap.data());
331 Logger::log(Logger::INFO,
"[ 30%] Allocating device memory to tables...");
340 d_data.resize(queries.size() * hash_table_size, nulldata);
341 d_data_table = thrust::raw_pointer_cast(d_data.data());
342 d_hash_table =
reinterpret_cast<T_HASHTABLE*
>(d_data_table);
344 Logger::log(Logger::INFO,
"[ 33%] Copying memory to symbol...");
346 cudaCheckErrors(cudaMemcpyToSymbol(d_offsets, h_offsets,
sizeof(
u32)*16, 0, cudaMemcpyHostToDevice));
348 Logger::log(Logger::INFO,
"[ 40%] Starting match kernels...");
349 cudaEventRecord(kernel_start);
351 bool h_overflow[1] = {
false};
358 h_overflow[0] =
false;
359 cudaCheckErrors(cudaMemcpy(d_overflow, h_overflow,
sizeof(
bool), cudaMemcpyHostToDevice));
360 d_threshold.resize(queries.size());
361 thrust::fill(d_threshold.begin(), d_threshold.end(), 1);
362 u32 * d_threshold_p = thrust::raw_pointer_cast(d_threshold.data());
368 d_passCount.resize(queries.size()*num_of_max_count);
369 thrust::fill(d_passCount.begin(), d_passCount.end(), 0u);
370 u32 * d_passCount_p = thrust::raw_pointer_cast(d_passCount.data());
372 device_vector<u32> d_topks;
373 d_topks.resize(queries.size());
374 thrust::fill(d_topks.begin(), d_topks.end(), max_topk);
375 u32 * d_topks_p = thrust::raw_pointer_cast(d_topks.data());
380 table.
i_size() * ((
unsigned int)1<<shift_bits_subsequence),
393 shift_bits_subsequence);
395 cudaCheckErrors(cudaMemcpy(h_overflow, d_overflow,
sizeof(
bool), cudaMemcpyDeviceToHost));
399 hash_table_size += num_of_max_count*max_topk;
400 if(hash_table_size > table.
i_size())
402 hash_table_size = table.
i_size();
404 thrust::fill(d_noiih.begin(), d_noiih.end(), 0u);
407 thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
409 d_data.resize(queries.size()*hash_table_size);
410 thrust::fill(d_data.begin(), d_data.end(), nulldata);
411 d_data_table = thrust::raw_pointer_cast(d_data.data());
412 d_hash_table =
reinterpret_cast<T_HASHTABLE*
>(d_data_table);
415 if (loop_count>1 || (loop_count == 1 && h_overflow[0]))
417 Logger::log(Logger::INFO,
"%d time trying to launch match kernel: %s!", loop_count, h_overflow[0]?
"failed":
"succeeded");
421 }
while (h_overflow[0]);
425 cudaEventRecord(kernel_stop);
426 Logger::log(Logger::INFO,
"[ 90%] Starting data converting......");
428 convert_to_data<<<hash_table_size*queries.size() / 1024 + 1,1024>>>(d_hash_table,(
u32)hash_table_size*queries.size());
430 Logger::log(Logger::INFO,
"[100%] Matching is done!");
434 cudaEventSynchronize(kernel_stop);
435 float kernel_elapsed = 0.0f;
436 cudaEventElapsedTime(&kernel_elapsed, kernel_start, kernel_stop);
437 Logger::log(Logger::INFO,
438 ">>>>[time profiling]: Match kernel takes %f ms. (GPU running) ",
440 Logger::log(Logger::INFO,
441 ">>>>[time profiling]: Match function takes %f ms. (including Match kernel, GPU+CPU part)",
443 Logger::log(Logger::VERBOSE,
">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
447 .QueryCompilationTime(
getInterval(match_query_start, match_query_end))
448 .QueryTransferTime(
getInterval(query_start, query_end))
449 .DataTransferTime(
getInterval(dataTransferStart, dataTransferEnd))
450 .MatchingTime(kernel_elapsed)
451 .InvSize(
sizeof(
int) * table.
inv()->size())
453 .HashTableCapacityPerQuery(hash_table_size)
454 .ThresholdSize(queries.size() *
sizeof(
u32))
455 .PasscountSize(queries.size() * num_of_max_count *
sizeof(
u32))
456 .BitmapSize(bitmap_size *
sizeof(
u32))
457 .NumItemsInHashTableSize(queries.size() *
sizeof(
u32))
458 .TopksSize(queries.size() *
sizeof(
u32))
459 .HashTableSize(queries.size() * hash_table_size *
sizeof(
data_t));
461 }
catch(std::bad_alloc &e){
468 static int build_queries_direct(vector<genie::query::Query> &queries,
genie::table::inv_table &table, vector<genie::query::Query::dim> &dims)
473 for (
size_t i = 0; i < queries.size(); ++i)
475 if (queries[i].ref_table() != &table)
477 int prev_size = dims.size();
479 queries[i].build(dims);
480 int count = dims.size() - prev_size;
481 if (count > max_count)
487 catch (std::bad_alloc &e)
495 catch (std::exception &e)
502 match_MT(vector<inv_table*>& table, vector<vector<Query> >& queries,
503 vector<device_vector<data_t> >& d_data, vector<device_vector<u32> >& d_bitmap,
504 vector<int>& hash_table_size, vector<int>& max_load,
int bitmap_bits,
505 vector<device_vector<u32> >& d_noiih, vector<device_vector<u32> >& d_threshold,
506 vector<device_vector<u32> >& d_passCount,
size_t start,
size_t finish)
511 u64 match_stop, match_start;
512 cudaEvent_t kernel_start, kernel_stop;
513 float kernel_elapsed;
514 cudaEventCreate(&kernel_start);
515 cudaEventCreate(&kernel_stop);
517 Logger::log(Logger::INFO,
"[ 0%] Starting matching...");
520 u32 shift_bits_subsequence = table.at(0)->_shift_bits_subsequence();
521 vector<vector<Query::dim> > dims(table.size());
522 vector<device_vector<Query::dim> > d_dims(table.size());
523 vector<Query::dim*> d_dims_p(table.size());
524 vector<u32*> d_noiih_p(table.size());
525 vector<u32> num_of_max_count(table.size(), 0);
526 vector<u32> max_topk(table.size(), 0);
527 vector<u32*> d_bitmap_p(table.size());
528 vector<bool*> d_overflow(table.size());
529 vector<T_HASHTABLE*> d_hash_table(table.size());
530 vector<u32*> d_threshold_p(table.size());
531 vector<u32*> d_passCount_p(table.size());
532 vector<device_vector<u32> > d_topks(table.size());
533 vector<u32*> d_topks_p(table.size());
534 vector<int> threshold(table.size(), bitmap_bits - 1);
535 vector<int> bitmap_size(table.size(), 0);
536 data_t nulldata = {0u, 0.0f};
542 bitmap_bits = -bitmap_bits;
543 Logger::log(Logger::DEBUG,
544 "[info] bitmap_bits:%d.",
546 int bitmap_bits_copy = bitmap_bits;
549 for (
size_t i = start; i < finish; ++i)
551 if (queries.at(i).empty())
553 if (table.at(i)->build_status() == inv_table::not_builded)
557 bitmap_bits = bitmap_bits_copy;
560 float logresult = std::log2((
float) bitmap_bits);
561 bitmap_bits = (int) logresult;
562 if (logresult - bitmap_bits > 0)
564 logresult = std::log2((
float) bitmap_bits);
565 bitmap_bits = (int) logresult;
566 if (logresult - bitmap_bits > 0)
568 bitmap_bits = pow(2, bitmap_bits);
569 bitmap_size[i] = ((((
unsigned int)1<<shift_bits_subsequence) * table.at(i)->i_size()) / (32 / bitmap_bits) + 1)
570 * queries.at(i).size();
573 bitmap_bits = threshold[i] = 0;
575 Logger::log(Logger::DEBUG,
"[ 20%] Declaring device memory...");
576 d_bitmap.at(i).resize(bitmap_size.at(i));
577 d_bitmap_p[i] = thrust::raw_pointer_cast(d_bitmap.at(i).data());
580 d_noiih.at(i).resize(queries.at(i).size(), 0u);
581 d_noiih_p[i] = thrust::raw_pointer_cast(d_noiih.at(i).data());
582 Logger::log(Logger::DEBUG,
"hash table size: %d.", hash_table_size.at(i));
585 u64 match_query_start, match_query_end;
587 num_of_max_count[i] = build_queries_direct(queries.at(i), table.at(i)[0], dims.at(i));
590 Logger::log(Logger::DEBUG,
591 ">>>>[time profiling]: match: build_queries function takes %f ms. ",
593 Logger::log(Logger::DEBUG,
" dims size: %d.",
598 d_dims[i] = dims.at(i);
600 d_dims_p[i] = raw_pointer_cast(d_dims.at(i).data());
604 Logger::log(Logger::DEBUG,
"[ 30%] Allocating device memory to tables...");
607 d_data.at(i).clear();
608 d_data.at(i).resize(queries.at(i).size() * hash_table_size.at(i), nulldata);
609 d_data_table = thrust::raw_pointer_cast(d_data.at(i).data());
610 d_hash_table[i] =
reinterpret_cast<T_HASHTABLE*
>(d_data_table);
615 cudaCheckErrors(cudaMemcpy(d_overflow[i], &f,
sizeof(
bool), cudaMemcpyHostToDevice));
618 d_threshold.at(i).resize(queries.at(i).size());
619 thrust::fill(d_threshold.at(i).begin(), d_threshold.at(i).end(), 1);
620 d_threshold_p[i] = thrust::raw_pointer_cast(d_threshold.at(i).data());
623 d_passCount.at(i).resize(queries.at(i).size() * num_of_max_count.at(i));
624 thrust::fill(d_passCount.at(i).begin(), d_passCount.at(i).end(), 0u);
625 d_passCount_p[i] = thrust::raw_pointer_cast(d_passCount.at(i).data());
629 d_topks.at(i).resize(queries.at(i).size());
630 thrust::fill(d_topks.at(i).begin(), d_topks.at(i).end(), max_topk[i]);
631 d_topks_p[i] = thrust::raw_pointer_cast(d_topks.at(i).data());
635 Logger::log(Logger::INFO,
"[ 33%] Copying memory to symbol...");
637 cudaCheckErrors(cudaMemcpyToSymbol(d_offsets, h_offsets,
sizeof(
u32)*16, 0, cudaMemcpyHostToDevice));
641 Logger::log(Logger::INFO,
"[ 40%] Starting match kernels...");
642 cudaEventRecord(kernel_start);
643 for (
size_t i = start; i < finish; ++i)
645 if (queries.at(i).empty())
648 table.at(i)->m_size(),
649 table.at(i)->i_size() * ((
unsigned int)1<<shift_bits_subsequence),
650 hash_table_size.at(i),
651 table.at(i)->d_inv_p,
659 num_of_max_count.at(i),
662 shift_bits_subsequence);
667 for (
size_t i = start; i < finish; ++i)
670 cudaEventRecord(kernel_stop);
671 Logger::log(Logger::INFO,
"[ 90%] Starting data converting......");
673 for (
size_t i = start; i < finish; ++i)
674 convert_to_data<<<hash_table_size.at(i) * queries.at(i).size() / 1024 + 1, 1024>>>(d_hash_table.at(i), (
u32)hash_table_size.at(i)*queries.at(i).size());
676 Logger::log(Logger::INFO,
"[100%] Matching is done!");
679 cudaEventSynchronize(kernel_stop);
681 kernel_elapsed = 0.0f;
682 cudaEventElapsedTime(&kernel_elapsed, kernel_start, kernel_stop);
683 Logger::log(Logger::INFO,
684 ">>>>[time profiling]: Match kernel takes %f ms. (GPU running) ",
686 Logger::log(Logger::INFO,
687 ">>>>[time profiling]: Match function takes %f ms. (including Match kernel, GPU+CPU part)",
689 Logger::log(Logger::VERBOSE,
">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
690 }
catch(std::bad_alloc &e){
__device__ __forceinline__ void access_kernel_AT(u32 id, T_HASHTABLE *htable, int hash_table_size, genie::query::Query::dim &q, u32 count, bool *key_found, u32 *my_threshold, bool *pass_threshold)
const size_t MATCH_THREADS_PER_BLOCK
bool cpy_data_to_gpu()
Copy vector _inv to gpu memory which is referenced by d_inv_p.
This is the top-level namespace of the project.
int * d_inv_p
d_inv_p points to the start location for posting list array in GPU memory.
__device__ __forceinline__ void hash_kernel_AT(u32 id, T_HASHTABLE *htable, int hash_table_size, genie::query::Query::dim &q, u32 count, u32 *my_threshold, u32 *my_noiih, bool *overflow, bool *pass_threshold)
unsigned long long getTime()
Get system time.
The declaration for class inv_table.
void match_MT(vector< inv_table *> &table, vector< vector< Query > > &queries, vector< device_vector< data_t > > &d_data, vector< device_vector< u32 > > &d_bitmap, vector< int > &hash_table_size, vector< int > &max_load, int bitmap_bits, vector< device_vector< u32 > > &d_noiih, vector< device_vector< u32 > > &d_threshold, vector< device_vector< u32 > > &d_passCount, size_t start, size_t finish)
__global__ void convert_to_data(T_HASHTABLE *table, u32 size)
int get_total_num_of_table() const
return the total_num_of_table.
int cal_max_topk(vector< Query > &queries)
int shift_bits_sequence
This variable is used to tell the number of bits shifted for recording gram in different position...
unsigned int _shift_bits_subsequence()
This file includes interfaces of original GENIE match functions.
virtual std::vector< int > * inv()
Record run-time information.
int build_queries(vector< Query > &queries, inv_table &table, vector< Query::dim > &dims, int max_load)
__device__ __forceinline__ u32 bitmap_kernel_AT(u32 access_id, u32 *bitmap, int bits, int my_threshold, bool *key_eligible)
__global__ void match_AT(int m_size, int i_size, int hash_table_size, int *d_inv, Query::dim *d_dims, T_HASHTABLE *hash_table_list, u32 *bitmap_list, int bitmap_bits, u32 *d_topks, u32 *d_threshold, u32 *d_passCount, u32 num_of_max_count, u32 *noiih, bool *overflow, unsigned int shift_bits_subsequence)
unsigned long long u64
A type definition for a 64-bit unsigned integer.
bool is_stored_in_gpu
is_stored_in_gpu tell whether inverted index structure is pre-stored inside gpu memory ...
The second-step struct for processing queries.
double getInterval(unsigned long long start, unsigned long long stop)
Calculate time interval from start to end.
Functions about getting system time.
Basic utility functions to be used in matching kernels.
void match(inv_table &table, vector< Query > &queries, device_vector< data_t > &d_data, device_vector< u32 > &d_bitmap, int hash_table_size, int max_load, int bitmap_bits, device_vector< u32 > &d_noiih, device_vector< u32 > &d_threshold, device_vector< u32 > &d_passCount)
__device__ __forceinline__ void updateThreshold(u32 *my_passCount, u32 *my_threshold, u32 my_topk, u32 count)
#define cudaCheckErrors(err)
The wrapper function to validate CUDA calls.