#include <thrust/copy.h>
#include <thrust/device_vector.h>

#define GPUGENIE_INTEGRATED_KERNEL_SM_SIZE (1024)

// Explicit instantiations of match_integrated for every codec configuration
// supported by the integrated (decompress-and-match) kernel.
template void match_integrated<DeviceCopyCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceDeltaCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceBitPackingCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceVarintCodec>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceCompositeCodec<DeviceBitPackingCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceCompositeCodec<DeviceBitPackingCodec,DeviceVarintCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceCopyCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCopyCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceDeltaCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceVarintCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceBitPackingCodec>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCompositeCodec<DeviceBitPackingCodec,DeviceCopyCodec>>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);

template void match_integrated<DeviceSerialCodec<DeviceDeltaCodec,DeviceCompositeCodec<DeviceBitPackingCodec,DeviceVarintCodec>>>(
        inv_compr_table&, std::vector<Query>&, thrust::device_vector<data_t>&, thrust::device_vector<u32>&,
        int, int, thrust::device_vector<u32>&, thrust::device_vector<u32>&, thrust::device_vector<u32>&);
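// Usage sketch (illustrative only; the variable names below are hypothetical):
// a caller selects one of the instantiations above by naming the codec as the
// template argument, e.g.
//
//   match_integrated<DeviceSerialCodec<DeviceDeltaCodec, DeviceVarintCodec>>(
//           table, queries, d_hash_table, d_bitmap,
//           hash_table_size, bitmap_bits, d_noiih, d_threshold, d_passCount);
//
// The codec named here must be the one the inv_compr_table was compressed with.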
int getBitmapSize(
        int &in_out_bitmap_bits,
        u32 in_shift_bits_subsequence,
        int in_number_of_data_points,
        int in_queries_size)
{
    if (in_out_bitmap_bits < 0)
        in_out_bitmap_bits = -in_out_bitmap_bits;

    int threshold = in_out_bitmap_bits - 1, bitmap_size = 0;
    if (in_out_bitmap_bits > 1)
    {
        // First pass: number of bits needed to count up to the threshold.
        float logresult = std::log2((float) in_out_bitmap_bits);
        in_out_bitmap_bits = (int) logresult;
        if (logresult - in_out_bitmap_bits > 0)
        {
            in_out_bitmap_bits += 1;
        }
        // Second pass: round that width up to a power of two so counters
        // pack evenly into 32-bit words.
        logresult = std::log2((float) in_out_bitmap_bits);
        in_out_bitmap_bits = (int) logresult;
        if (logresult - in_out_bitmap_bits > 0)
        {
            in_out_bitmap_bits += 1;
        }
        in_out_bitmap_bits = pow(2, in_out_bitmap_bits);

        // One u32 packs (32 / bitmap_bits) counters; allocate one bitmap per query.
        bitmap_size = ((((unsigned int)1 << in_shift_bits_subsequence) * in_number_of_data_points)
                / (32 / in_out_bitmap_bits) + 1) * in_queries_size;
    }
    else
    {
        in_out_bitmap_bits = threshold = 0;
    }

    Logger::log(Logger::DEBUG, "Bitmap bits: %d, threshold:%d, shift_bits_subsequence: %d",
            in_out_bitmap_bits, threshold, in_shift_bits_subsequence);

    return bitmap_size;
}
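// Worked example (traced through the code above; the inputs are made up):
// with in_out_bitmap_bits = 3, in_shift_bits_subsequence = 0, 1000 data points
// and 4 queries:
//   threshold = 3 - 1 = 2;
//   first ceil(log2(3)) pass -> 2, second ceil(log2(2)) pass -> 1,
//   so in_out_bitmap_bits = pow(2, 1) = 2 bits per counter;
//   one u32 then packs 32 / 2 = 16 counters, and the function returns
//   (1000 / 16 + 1) * 4 = 63 * 4 = 252 words.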
int build_compressed_queries(vector<Query>& queries, inv_compr_table* ctable,
        vector<Query::dim>& dims, int max_load)
{
    int max_count = -1;
    for (unsigned int i = 0; i < queries.size(); ++i)
    {
        assert(queries[i].ref_table() == ctable);
        queries[i].build_compressed(max_load);

        int prev_size = dims.size();
        queries[i].dump(dims);

        // Number of dims this query contributed.
        int count = dims.size() - prev_size;
        if (count > max_count)
            max_count = count;
    }

    Logger::log(Logger::DEBUG, " dims size: %d.", dims.size());
    Logger::log(Logger::DEBUG, "max_count: %d", max_count);
    return max_count;
}
template <class Codec> __global__
void match_adaptiveThreshold_integrated(
        int m_size,                      // number of Query::dim entries
        int i_size,                      // number of inverted-index entries
        int hash_table_size,
        uint32_t* d_compr_inv,           // compressed inverted lists
        Query::dim* d_dims,
        T_HASHTABLE* hash_table_list,
        u32* bitmap_list,
        int bitmap_bits,
        u32* d_topks,
        u32* d_threshold,                // adaptive threshold, one per query
        u32* d_passCount,                // per-query count histogram
        u32 num_of_max_count,
        u32* noiih,                      // number of items in hash table, one per query
        bool* overflow,
        unsigned int shift_bits_subsequence)
{
    assert(m_size != 0 && i_size != 0);

    // One thread block handles one query dimension.
    Query::dim& myb_query = d_dims[blockIdx.x];
    int query_index = myb_query.query;
    u32* my_noiih = &noiih[query_index];
    u32* my_threshold = &d_threshold[query_index];
    u32* my_passCount = &d_passCount[query_index * num_of_max_count];
    u32 my_topk = d_topks[query_index];

    T_HASHTABLE* hash_table = &hash_table_list[query_index * hash_table_size];
    u32* bitmap;
    if (bitmap_bits)
        bitmap = &bitmap_list[query_index * (i_size / (32 / bitmap_bits) + 1)];

    // This dimension's range in the compressed inverted lists.
    int min = myb_query.start_pos;
    int max = myb_query.end_pos;
    size_t comprLength = max - min;
    int order = myb_query.order;

    Codec codec;
    // The compressed list must fit a single cooperative block-wide decode.
    assert(max - min <= codec.decodeArrayParallel_maxBlocks() * codec.decodeArrayParallel_lengthPerBlock());
    assert(max - min <= gridDim.x * blockDim.x * codec.decodeArrayParallel_threadLoad());
    assert(blockDim.x == codec.decodeArrayParallel_lengthPerBlock() / codec.decodeArrayParallel_threadLoad());

    __shared__ uint32_t s_comprInv[GPUGENIE_INTEGRATED_KERNEL_SM_SIZE];
    __shared__ uint32_t s_decomprInv[GPUGENIE_INTEGRATED_KERNEL_SM_SIZE];

    int idx = threadIdx.x;
    // Stage the compressed list into shared memory; zero-fill the remainder.
    for (int i = 0; i < codec.decodeArrayParallel_lengthPerBlock(); i += codec.decodeArrayParallel_threadsPerBlock())
    {
        s_comprInv[idx + i] = (idx + i < (int)comprLength) ? d_compr_inv[idx + i + min] : 0;
        s_decomprInv[idx + i] = 0;
    }
    __syncthreads();

    // Cooperative in-place decode into the shared decompression buffer.
    size_t decomprLength = GPUGENIE_INTEGRATED_KERNEL_SM_SIZE;
    codec.decodeArrayParallel(s_comprInv, comprLength, s_decomprInv, decomprLength);
    __syncthreads();
    assert(decomprLength != 0);
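    // This staging pattern is what makes the kernel "integrated": the block
    // copies its compressed posting list into shared memory, cooperatively
    // decodes it in place, and then matches directly from s_decomprInv,
    // avoiding a separate device-wide decompression pass and its global-memory
    // round trip.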
    // ... (each thread then iterates over its share of the decoded ids in
    // s_decomprInv; access_id below is the id currently being processed)

    if (shift_bits_subsequence != 0)
    {
        // Map a subsequence-encoded id back to its base document id.
        int __offset = access_id & (((unsigned int)1 << shift_bits_subsequence) - 1);
        int __new_offset = __offset - order;
        if (__new_offset >= 0)
        {
            access_id = access_id - __offset + __new_offset;
        }
        else
            continue;
    }

    u32 thread_threshold = *my_threshold;
    u32 count = 0;
    bool key_eligible = false;
    bool pass_threshold = false;

    // Stage 1: cheap packed-counter filter in the bitmap.
    key_eligible = false;
    count = bitmap_kernel_AT(access_id, bitmap, bitmap_bits,
            thread_threshold, &key_eligible);
    // ...

    // Stage 2: look up an existing hash-table entry for this id.
    key_eligible = false;
    if (count < *my_threshold)
        continue;   // the adaptive threshold has risen; no need to insert
    access_kernel_AT(access_id,
            hash_table, hash_table_size, myb_query, count, &key_eligible,
            my_threshold, &pass_threshold);
    // ...

    // Stage 3: insert a new entry, possibly raising the adaptive threshold
    // or flagging hash-table overflow.
    if (count < *my_threshold)
        continue;
    hash_kernel_AT(access_id, hash_table, hash_table_size, myb_query, count,
            my_threshold, my_noiih, overflow, &pass_threshold);
    // ... (on success, updateThreshold(my_passCount, my_threshold, my_topk,
    // count) is applied; on overflow the kernel returns early)
}
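// Note on the adaptive threshold (a reading of the code above, not a statement
// from this file): *my_threshold is shared by all blocks working on the same
// query. Candidates are counted cheaply in the packed bitmap first and promoted
// to the hash table only once their count reaches the current threshold, which
// hash_kernel_AT can raise as the top-k fills, so later candidates face a
// progressively stricter filter.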
template <class Codec>
void match_integrated(
        inv_compr_table& table,
        std::vector<Query>& queries,
        thrust::device_vector<data_t>& d_hash_table,
        thrust::device_vector<u32>& d_bitmap,
        int hash_table_size,
        int bitmap_bits,
        thrust::device_vector<u32>& d_num_of_items_in_hashtable,
        thrust::device_vector<u32>& d_threshold,
        thrust::device_vector<u32>& d_passCount)
{
    // CUDA events for timing the matching and conversion kernels.
    cudaEvent_t startMatching, stopMatching;
    cudaEvent_t startConvert, stopConvert;
    cudaEventCreate(&startMatching);
    cudaEventCreate(&stopMatching);
    cudaEventCreate(&startConvert);
    cudaEventCreate(&stopConvert);

    // Wall-clock timers for the individual phases (reported at the end).
    u64 overallStart, overallEnd;
    u64 queryCompilationStart, queryCompilationEnd;
    u64 preprocessingStart, preprocessingEnd;
    u64 queryTransferStart, queryTransferEnd;
    u64 dataTransferStart, dataTransferEnd;
    u64 constantTransferStart, constantTransferEnd;
    u64 allocationStart, allocationEnd;
    u64 fillingStart, fillingEnd;

    Logger::log(Logger::INFO, "*** Starting matching (Integrated Compressed)...");
    overallStart = getTime();
    Logger::log(Logger::INFO, " Preprocessing variables for matching kernel...");
    preprocessingStart = getTime();
    unsigned int shift_bits_subsequence = table._shift_bits_subsequence();
    int bitmap_size = getBitmapSize(bitmap_bits, shift_bits_subsequence, table.i_size(), queries.size());
    assert(bitmap_size > 0);
    preprocessingEnd = getTime();

    Logger::log(Logger::INFO, " Compiling queries...");
    queryCompilationStart = getTime();

    vector<Query::dim> dims;
    int max_topk = cal_max_topk(queries);
    int num_of_max_count = build_compressed_queries(queries, &table, dims,
            table.getUncompressedPostingListMaxLength());
    queryCompilationEnd = getTime();

    Logger::log(Logger::INFO, " Transferring queries to device...");
    queryTransferStart = getTime();
    thrust::device_vector<Query::dim> d_dims(dims);
    queryTransferEnd = getTime();
    Logger::log(Logger::INFO, " Transferring inverted lists to device...");
    dataTransferStart = getTime();
    // ... (the compressed inverted lists are copied to the GPU unless the table
    // is already resident; see inv_compr_table::cpy_data_to_gpu)
    dataTransferEnd = getTime();

    Logger::log(Logger::INFO, " Transferring constant symbol memory to device...");
    constantTransferStart = getTime();
    cudaCheckErrors(cudaMemcpyToSymbol(genie::matching::d_offsets, genie::matching::h_offsets,
            sizeof(u32)*16, 0, cudaMemcpyHostToDevice));
    Logger::log(Logger::INFO, " Transferring offsets table (total %d bytes)", sizeof(u32)*16);
    constantTransferEnd = getTime();
    Logger::log(Logger::INFO, " Allocating matching memory on device...");
    allocationStart = getTime();

    Logger::log(Logger::INFO, " Allocating threshold (total %d bytes)...",
            queries.size() * sizeof(u32));
    d_threshold.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating passCount (total %d bytes)...",
            queries.size() * num_of_max_count * sizeof(u32));
    d_passCount.resize(queries.size() * num_of_max_count);

    Logger::log(Logger::INFO, " Allocating bitmap (total %d bytes)...",
            bitmap_size * sizeof(u32));
    d_bitmap.resize(bitmap_size);

    Logger::log(Logger::INFO, " Allocating num_of_items_in_hashtable (total %d bytes)...",
            queries.size() * sizeof(u32));
    d_num_of_items_in_hashtable.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating d_topks (total %d bytes)...",
            queries.size() * sizeof(u32));
    thrust::device_vector<u32> d_topks;
    d_topks.resize(queries.size());

    Logger::log(Logger::INFO, " Allocating hash_table (total %d bytes)...",
            queries.size() * hash_table_size * sizeof(data_t));
    d_hash_table.resize(queries.size() * hash_table_size);

    bool h_overflow[1] = {false};
    bool* d_overflow_p;
    Logger::log(Logger::INFO, " Allocating hash table overflow indicator (total %d bytes)...",
            sizeof(bool));
    cudaCheckErrors(cudaMalloc((void**)&d_overflow_p, sizeof(bool)));
    allocationEnd = getTime();
    Logger::log(Logger::INFO, " Matching...");
    for (int loop_count = 1; ; loop_count++)
    {
        Logger::log(Logger::INFO, " Preparing matching... (attempt %d)", loop_count);
        Logger::log(Logger::INFO, " Filling matching memory on device...");
        fillingStart = getTime();
        thrust::fill(d_threshold.begin(), d_threshold.end(), 1);
        thrust::fill(d_passCount.begin(), d_passCount.end(), 0u);
        thrust::fill(d_bitmap.begin(), d_bitmap.end(), 0u);
        thrust::fill(d_num_of_items_in_hashtable.begin(), d_num_of_items_in_hashtable.end(), 0u);
        thrust::fill(d_topks.begin(), d_topks.end(), max_topk);
        thrust::fill(d_hash_table.begin(), d_hash_table.end(), data_t{0u, 0.0f});

        h_overflow[0] = false;
        cudaCheckErrors(cudaMemcpy(d_overflow_p, h_overflow, sizeof(bool), cudaMemcpyHostToDevice));
        fillingEnd = getTime();
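        // Reset rationale (as visible above): thresholds restart at 1, top-k
        // budgets at max_topk, and the overflow flag is cleared, so every retry
        // attempt recomputes matching from a clean slate rather than resuming.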
        Logger::log(Logger::INFO, " Starting decompression & match kernel...");
        cudaEventRecord(startMatching);

        // Grid: one block per Query::dim (the kernel indexes d_dims by
        // blockIdx.x); block size chosen to satisfy the kernel's blockDim assert.
        Codec codec;
        match_adaptiveThreshold_integrated<Codec><<<dims.size(),
                codec.decodeArrayParallel_lengthPerBlock() / codec.decodeArrayParallel_threadLoad()>>>(
                dims.size(),
                (table.i_size() * ((unsigned int)1 << shift_bits_subsequence)),
                hash_table_size,
                table.deviceCompressedInv(),
                thrust::raw_pointer_cast(d_dims.data()),
                reinterpret_cast<T_HASHTABLE*>(thrust::raw_pointer_cast(d_hash_table.data())),
                thrust::raw_pointer_cast(d_bitmap.data()),
                bitmap_bits,
                thrust::raw_pointer_cast(d_topks.data()),
                thrust::raw_pointer_cast(d_threshold.data()),
                thrust::raw_pointer_cast(d_passCount.data()),
                num_of_max_count,
                thrust::raw_pointer_cast(d_num_of_items_in_hashtable.data()),
                d_overflow_p,
                shift_bits_subsequence);

        cudaEventRecord(stopMatching);
        cudaEventSynchronize(stopMatching);
        Logger::log(Logger::INFO, " Checking for hash table overflow...");
        cudaCheckErrors(cudaMemcpy(h_overflow, d_overflow_p, sizeof(bool), cudaMemcpyDeviceToHost));
        if (!h_overflow[0])
        {
            Logger::log(Logger::INFO, " Matching succeeded");
            break;
        }

        // Overflow: grow the per-query hash tables and retry.
        Logger::log(Logger::INFO, " Matching failed (hash table overflow)");
        hash_table_size += num_of_max_count * max_topk;
        if (hash_table_size > table.i_size())
            hash_table_size = table.i_size();
        d_hash_table.resize(queries.size() * hash_table_size);
        Logger::log(Logger::INFO, " Resized hash table (now total of %d bytes)",
                queries.size() * hash_table_size * sizeof(data_t));
    }
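    // Retry policy (as visible in the loop above): each overflow grows every
    // query's hash-table capacity by num_of_max_count * max_topk slots, capped
    // at table.i_size(), then the loop re-runs the kernel from the fill step.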
    Logger::log(Logger::INFO, " Starting data conversion from hash tables...");
    Logger::log(Logger::INFO, " Starting conversion kernel...");
    cudaEventRecord(startConvert);
    // In-place conversion of raw hash-table entries into data_t records,
    // one thread per hash-table slot.
    convert_to_data<<<hash_table_size * queries.size() / MATCH_THREADS_PER_BLOCK + 1,
            MATCH_THREADS_PER_BLOCK>>>(
            reinterpret_cast<T_HASHTABLE*>(thrust::raw_pointer_cast(d_hash_table.data())),
            (u32)hash_table_size * queries.size());
    cudaEventRecord(stopConvert);

    Logger::log(Logger::INFO, " Deallocating memory...");
    cudaCheckErrors(cudaFree(d_overflow_p));

    Logger::log(Logger::INFO, " Matching is done!");
    overallEnd = getTime();
    float matchingTime, convertTime;
    cudaEventElapsedTime(&matchingTime, startMatching, stopMatching);
    cudaEventElapsedTime(&convertTime, startConvert, stopConvert);

    // Report per-phase timings and buffer sizes to the performance log.
    // ... (receiver of this builder chain elided)
            .OverallTime(getInterval(overallStart, overallEnd))
            .QueryCompilationTime(getInterval(queryCompilationStart, queryCompilationEnd))
            .PreprocessingTime(getInterval(preprocessingStart, preprocessingEnd))
            .QueryTransferTime(getInterval(queryTransferStart, queryTransferEnd))
            .DataTransferTime(getInterval(dataTransferStart, dataTransferEnd))
            .ConstantTransferTime(getInterval(constantTransferStart, constantTransferEnd))
            .AllocationTime(getInterval(allocationStart, allocationEnd))
            .FillingTime(getInterval(fillingStart, fillingEnd))
            .MatchingTime(matchingTime)
            .ConvertTime(convertTime)
            // ...
            .HashTableCapacityPerQuery(hash_table_size)
            .ThresholdSize(queries.size() * sizeof(u32))
            .PasscountSize(queries.size() * num_of_max_count * sizeof(u32))
            .BitmapSize(bitmap_size * sizeof(u32))
            .NumItemsInHashTableSize(queries.size() * sizeof(u32))
            .TopksSize(queries.size() * sizeof(u32))
            .HashTableSize(queries.size() * hash_table_size * sizeof(data_t));
}
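// Readback sketch (hypothetical caller code, not part of this file): after
// match_integrated returns, d_hash_table holds hash_table_size converted data_t
// entries per query, which can be copied back to the host with thrust:
//
//   std::vector<data_t> h_result(d_hash_table.size());
//   thrust::copy(d_hash_table.begin(), d_hash_table.end(), h_result.begin());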