#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/count.h>

#define THREADS_PER_BLOCK 256
#define GPUGenie_Minus_One_THREADS_PER_BLOCK 1024

// One block per query: each thread strides over that query's slice of the
// hashtable and accumulates, in its private result slot, the number of
// entries whose aggregation value lies strictly above the query's threshold.
__global__
void count_over_threshold(data_t *data, int *result, u32 *thresholds, int data_size)
{
    int tid = threadIdx.x;
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    data_t *my_data = data + data_size * blockIdx.x;
    u32 my_threshold = thresholds[blockIdx.x];
    for (int i = 0; i * blockDim.x + tid < data_size; ++i)
    {
        int id = i * blockDim.x + tid;
        if (my_data[id].aggregation > my_threshold)
            ++result[index];
    }
}
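A minimal host-side sketch of the indexing scheme the kernel relies on (data_t and u32 come from the GENIE headers; the helper name is hypothetical): thread t of query q visits elements t, t + blockDim.x, t + 2 * blockDim.x, ... of the query's slice and produces one private count.

// Illustrative reference only: the count that thread t of query q produces.
int count_for_thread(const data_t *data, int data_size, u32 threshold,
        int q, int t, int threads_per_block)
{
    const data_t *my_data = data + (size_t) q * data_size;
    int count = 0;
    for (int id = t; id < data_size; id += threads_per_block)
        if (my_data[id].aggregation > threshold)
            ++count;
    return count;
}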
// Per-block exclusive prefix sum (double-buffered Hillis-Steele) over the
// per-thread counts of one query. Each block writes size + 1 outputs: the
// exclusive prefix sums, plus the block total in the last slot.
__global__
void exclusive_scan(int *data, int *buffer, int *output, int size)
{
    int tid = threadIdx.x;
    int *my_buffer = buffer + 2 * blockIdx.x * size;
    int *my_data = data + blockIdx.x * size;
    int *my_output = output + blockIdx.x * (size + 1);
    my_buffer[size + tid] = my_buffer[tid] = tid == 0 ? 0 : my_data[tid - 1];
    __syncthreads();
    for (int offset = 1; offset < size; offset *= 2)
    {
        if (tid >= offset)
            my_buffer[tid] += my_buffer[size + tid - offset];
        __syncthreads();
        my_buffer[size + tid] = my_buffer[tid];
        __syncthreads();
    }
    my_output[tid] = my_buffer[tid];
    if (tid == size - 1)
        my_output[tid + 1] = my_output[tid] + my_data[tid];
}
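What each block computes, restated sequentially (an illustrative sketch, not part of the file): for per-thread counts counts[0..size), output slot i is the number of qualifying entries owned by threads 0..i-1, and slot size holds the grand total.

// Hypothetical sequential reference for one block of exclusive_scan.
void exclusive_scan_reference(const int *counts, int *out, int size)
{
    out[0] = 0;
    for (int i = 1; i <= size; ++i)
        out[i] = out[i - 1] + counts[i - 1];
}
// e.g. counts = {3, 1, 0, 2}  ->  out = {0, 3, 4, 4, 6}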
// Scatter the top-k results for one query. Phase 1: each thread copies its
// strictly-above-threshold entries into the slots reserved for it by the
// exclusive scan. Phase 2: if fewer than topk_size entries qualified, top the
// list up with entries that exactly equal the threshold, claiming slots
// through an atomic counter in shared memory.
__global__
void fill_in_scan(data_t *data, u32 *thresholds, int *indices, data_t *topk,
        int data_size, int topk_size)
{
    int tid = threadIdx.x;
    data_t *my_data = data + data_size * blockIdx.x;
    u32 my_threshold = thresholds[blockIdx.x];
    data_t *my_topk = topk + blockIdx.x * topk_size
            + indices[blockIdx.x * (blockDim.x + 1) + tid];
    for (int i = 0; i * blockDim.x + tid < data_size; ++i)
    {
        int id = i * blockDim.x + tid;
        if (my_data[id].aggregation > my_threshold)
        {
            *my_topk = my_data[id];
            ++my_topk;
        }
    }

    __shared__ int index[1];
    my_topk = topk + blockIdx.x * topk_size;
    if (tid == 0)
        *index = indices[blockIdx.x * (blockDim.x + 1) + blockDim.x];
    __syncthreads();
    if (*index >= topk_size)
        return;
    for (int i = 0; i * blockDim.x + tid < data_size; ++i)
    {
        if (*index >= topk_size)
            return;
        int id = i * blockDim.x + tid;
        if (int(my_data[id].aggregation) == my_threshold)
        {
            int old_index = atomicAdd(index, 1);
            if (old_index >= topk_size)
                return;
            my_topk[old_index] = my_data[id];
        }
    }
}
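The two phases above, restated as a sequential sketch for a single query (illustrative only; the helper name is hypothetical and the integer-valued-aggregation assumption mirrors the kernel's int cast):

// Hypothetical sequential reference for fill_in_scan on one query.
void fill_in_scan_reference(const data_t *data, int data_size, u32 threshold,
        data_t *topk, int topk_size)
{
    int n = 0;
    for (int id = 0; id < data_size; ++id)            // phase 1: strictly above
        if (data[id].aggregation > threshold)
            topk[n++] = data[id];
    for (int id = 0; id < data_size && n < topk_size; ++id)  // phase 2: ties
        if ((int) data[id].aggregation == (int) threshold)
            topk[n++] = data[id];
}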
// Rewrite each query's threshold in place: nonzero thresholds drop by one,
// so the strict '>' comparisons in the kernels above admit entries equal to
// the original threshold, and thresholds beyond the pass-count range are
// clamped to max_count - 1.
__global__
void transform_threshold(u32 *thresholds, int size, int max_count)
{
    int tId = threadIdx.x + blockIdx.x * blockDim.x;
    if (tId >= size)
        return;
    if (thresholds[tId] > max_count) {
        thresholds[tId] = max_count - 1;
    } else if (thresholds[tId] != 0) {
        --thresholds[tId];
    }
}
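A host-side restatement of the transform, assuming integer-valued thresholds and pass counts (the function name is hypothetical):

// Hypothetical sequential reference for transform_threshold.
u32 transform_threshold_reference(u32 t, u32 max_count)
{
    if (t > max_count)
        return max_count - 1;  // clamp out-of-range thresholds
    if (t != 0)
        return t - 1;          // 'aggregation > t - 1' now means 'aggregation >= t'
    return 0;                  // zero stays zero: everything positive passes
}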
// Dump the hashtable to two text files named after the current date and time:
// a full listing, and a compact listing that skips empty entries.
void write_hashtable_to_file(thrust::device_vector<data_t>& d_data, int num_of_queries)
{
    int part_size = d_data.size() / num_of_queries;
    thrust::host_vector<data_t> h_data(d_data);

    FILE *fout = NULL;
    FILE *fout_compact = NULL;
    std::string s = currentDateTime();
    char fout_name[100];
    char fout_compact_name[100];
    sprintf(fout_name, "%s.txt", s.c_str());
    sprintf(fout_compact_name, "%s-compact.txt", s.c_str());
    fout = fopen(fout_name, "w");
    fout_compact = fopen(fout_compact_name, "w");
    for (int qi = 0; qi < num_of_queries; ++qi) {
        fprintf(fout, "Query %d:\n", qi);
        fprintf(fout_compact, "Query %d:\n", qi);
        for (int di = 0; di < part_size; ++di) {
            int id = qi * part_size + di;
            if (h_data[id].aggregation != 0.0f || h_data[id].id != 0) {
                fprintf(fout_compact, "[%d] %d\n", h_data[id].id,
                        int(h_data[id].aggregation));
            }
            fprintf(fout, "[%d] %d\n", h_data[id].id,
                    int(h_data[id].aggregation));
        }
        fprintf(fout_compact, "\n");
    }
    fclose(fout);
    fclose(fout_compact);
}
// Select the top-k entries of every query from the final hashtable.
void heap_count_topk(thrust::device_vector<data_t>& d_data,
        thrust::device_vector<data_t>& d_topk,
        thrust::device_vector<u32>& d_threshold,
        thrust::device_vector<u32>& d_passCount,
        int topk, int num_of_queries)
{
    int data_size = d_data.size() / num_of_queries;
    int threads = THREADS_PER_BLOCK;
    int max_count = d_passCount.size() / num_of_queries;

    // Per-(query, thread) counters for entries above the threshold.
    int *d_num_over_threshold_p;
    cudaCheckErrors(cudaMalloc((void**) &d_num_over_threshold_p,
            sizeof(int) * threads * num_of_queries));
    cudaCheckErrors(cudaMemset((void*) d_num_over_threshold_p, 0,
            sizeof(int) * threads * num_of_queries));

    // Shift the thresholds so the strict '>' tests select '>= threshold'.
    u32 *d_threshold_p = thrust::raw_pointer_cast(d_threshold.data());
    transform_threshold<<<
            (int) d_threshold.size() / GPUGenie_Minus_One_THREADS_PER_BLOCK + 1,
            GPUGenie_Minus_One_THREADS_PER_BLOCK>>>(d_threshold_p,
            d_threshold.size(), max_count);

    data_t *d_data_p = thrust::raw_pointer_cast(d_data.data());
    count_over_threshold<<<num_of_queries, threads>>>(d_data_p,
            d_num_over_threshold_p, d_threshold_p, data_size);

    // Exclusive scan of the per-thread counts gives each thread its write offset.
    int *d_buffer, *d_scan_indices;
    cudaCheckErrors(cudaMalloc((void**) &d_buffer,
            2 * sizeof(int) * threads * num_of_queries));
    cudaCheckErrors(cudaMemset((void*) d_buffer, 0,
            2 * sizeof(int) * threads * num_of_queries));
    cudaCheckErrors(cudaMalloc((void**) &d_scan_indices,
            sizeof(int) * (threads + 1) * num_of_queries));
    exclusive_scan<<<num_of_queries, threads>>>(d_num_over_threshold_p,
            d_buffer, d_scan_indices, threads);

    // Zero-fill the output, then scatter the qualifying entries per query.
    d_topk.resize(topk * num_of_queries);
    data_t empty_data = { 0, 0 };
    thrust::fill(d_topk.begin(), d_topk.end(), empty_data);
    data_t *d_topk_p = thrust::raw_pointer_cast(d_topk.data());
    fill_in_scan<<<num_of_queries, threads, 2 * sizeof(int)>>>(d_data_p,
            d_threshold_p, d_scan_indices, d_topk_p, data_size, topk);

    cudaCheckErrors(cudaFree(d_num_over_threshold_p));
    cudaCheckErrors(cudaFree(d_buffer));
    cudaCheckErrors(cudaFree(d_scan_indices));
}
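A minimal usage sketch, assuming the upstream GENIE match step has already populated the hashtable, thresholds, and pass counts (the vector contents and the choice of num_of_queries here are placeholders):

// Hypothetical driver code; data_t and u32 come from the GENIE headers.
thrust::device_vector<data_t> d_data;      // hashtable, one slice per query
thrust::device_vector<data_t> d_topk;      // resized and filled by the call
thrust::device_vector<u32> d_threshold;    // one selection threshold per query
thrust::device_vector<u32> d_passCount;    // pass counts per query
// ... run the GENIE match kernels to populate d_data, d_threshold, d_passCount ...
heap_count_topk(d_data, d_topk, d_threshold, d_passCount, /* topk = */ 10,
        /* num_of_queries = */ (int) d_threshold.size());
// d_topk now holds topk entries per query, padded with {0, 0} entries where a
// query produced fewer qualifying results.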
This file implements the function for top-k selection in the final hashtable. It includes the interfaces of the original GENIE match functions and the utility functions for getting the system time.

Functions

__global__ void count_over_threshold(data_t *data, int *result, u32 *thresholds, int data_size)
__global__ void exclusive_scan(int *data, int *buffer, int *output, int size)
__global__ void transform_threshold(u32 *thresholds, int size, int max_count)
__global__ void fill_in_scan(data_t *data, u32 *thresholds, int *indices, data_t *topk, int data_size, int topk_size)
void heap_count_topk(thrust::device_vector< genie::matching::data_t > &d_data, thrust::device_vector< genie::matching::data_t > &d_topk, thrust::device_vector< u32 > &d_threshold, thrust::device_vector< u32 > &d_passCount, int topk, int num_of_queries)
void write_hashtable_to_file(thrust::device_vector< data_t > &d_data, int num_of_queries)
std::string currentDateTime()
    Get the current date and time.

Macros

#define THREADS_PER_BLOCK
#define GPUGenie_Minus_One_THREADS_PER_BLOCK
#define cudaCheckErrors(err)
    The wrapper macro to validate CUDA calls.