19 #include <unordered_map> 22 #include <thrust/system_error.h> 23 #include <thrust/copy.h> 24 #include <thrust/device_vector.h> 25 #include <thrust/host_vector.h> 47 namespace genie {
namespace compression {
52 void swap(
int * position,
int offset1,
int offset2)
54 int temp = *(position + offset1);
55 *(position + offset1) = *(position + offset2);
56 *(position + offset2) = temp;
60 int partition(
int * id_array,
int * count_array,
const int left,
const int right)
64 const int pivot = count_array[left];
69 while(i <= j && count_array[i] > pivot)
73 while(i <= j && count_array[j] <=pivot)
79 swap(count_array, i, j);
85 swap(count_array, left, i-1);
86 swap(id_array, left, i-1);
91 void quicksortIdCount(
int * id_array,
int * count_array,
const int left,
const int right)
93 if(left >= right)
return;
95 int mid =
partition(id_array, count_array, left, right);
104 cout <<
"print vector 1D. " << endl;
105 for(
unsigned int i = 0 ; i < toPrint.size() ; ++i)
107 cout << toPrint[i] <<
",";
115 std::vector<std::vector<int> >& _result_count, std::vector<int>& result,
116 std::vector<int>& result_count,
int table_num,
int topk,
int query_num,
unsigned int iteration_times)
118 cout <<
"result merge and rank " << endl;
119 cout <<
"The " << iteration_times <<
"th iteration." << endl;
121 cout <<
"topk = " << topk << endl;
122 int individual_topk = topk / iteration_times;
124 result_count.clear();
125 for (
int i = 0; i < query_num; ++i)
128 vector<int> sort_count;
129 for (
int j = 0; j < table_num; ++j)
134 sort.insert(sort.end(),
135 _result[j].begin() + (i * topk + individual_topk * (iteration_times - 1)),
136 _result[j].begin() + (i + 1) * topk);
137 sort_count.insert(sort_count.end(),
138 _result_count[j].begin() + i * topk + individual_topk * (iteration_times - 1),
139 _result_count[j].begin() + (i + 1) * topk);
143 result.insert(result.end(), sort.begin(), sort.begin() + individual_topk);
144 result_count.insert(result_count.end(), sort_count.begin(), sort_count.begin()+individual_topk);
148 cout <<
"Merge time = " <<
getInterval(start_merge, end_merge) <<
"ms. "<<endl;
153 unsigned int cycle = 0;
176 Logger::log(Logger::DEBUG,
"build from data_points...");
199 Logger::log(Logger::DEBUG,
"build from data_points...");
200 unsigned int table_num;
209 cycle = table_num - 2;
217 for (
unsigned int i = 0; i < cycle; ++i)
219 vector<vector<int> > temp;
220 temp.insert(temp.end(),
242 if (table_num != cycle)
244 vector<vector<int> > temp1;
245 vector<vector<int> > temp2;
246 unsigned int second_last_size = (config.
data_points->size()
248 temp1.insert(temp1.end(),
252 temp2.insert(temp2.end(),
283 inv_table * &_table, std::vector<int>& result,
284 std::vector<int>& result_count)
286 std::vector<Query> queries;
287 vector<vector<int> > _result;
288 vector<vector<int> > _result_count;
290 _result.resize(table_num);
291 _result_count.resize(table_num);
293 unsigned int accumulate_num = 0;
294 for (
unsigned int i = 0; i < table_num; ++i)
299 knn_search(_table[i], queries, _result[i], _result_count[i],config);
301 if (i <= 0)
continue;
302 accumulate_num += _table[i - 1].
i_size();
303 for (
unsigned int j = 0; j < _result[i].size(); ++j)
305 _result[i][j] += accumulate_num;
322 Logger::log(Logger::DEBUG,
"Data row size: %d. Data Row Number: %d.",
323 data_points[0].size(), data_points.size());
326 for (i = 0; i < data_points[0].size(); ++i)
328 std::vector<int> col;
329 col.reserve(data_points.size());
330 for (j = 0; j < data_points.size(); ++j)
332 col.push_back(data_points[j][i]);
345 double timeInterval =
getInterval(starttime, endtime);
346 Logger::log(Logger::DEBUG,
347 "Before finishing loading. i_size():%d, m_size():%d.",
349 Logger::log(Logger::VERBOSE,
350 ">>>>[time profiling]: loading index takes %f ms<<<<",
361 unsigned int row_size;
362 unsigned int index_start_pos = 0;
366 row_size = index[1] - index[0];
368 index_start_pos = index[0];
370 Logger::log(Logger::DEBUG,
"Data row size: %d. Data Row Number: %d.",
374 for (i = 0; i < row_size; ++i)
376 std::vector<int> col;
377 col.reserve(row_num);
378 for (j = 0; j < row_num; ++j)
380 col.push_back(data[index[j] + i - index_start_pos]);
393 double timeInterval =
getInterval(starttime, endtime);
394 Logger::log(Logger::DEBUG,
395 "Before finishing loading. i_size() : %d, m_size() : %d.",
397 Logger::log(Logger::VERBOSE,
398 ">>>>[time profiling]: loading index takes %f ms<<<<",
431 map<int, Query> query_map;
443 if (query_map.find(qid) == query_map.end())
458 query_map[qid].
attr(dim, val, weight, sel, query_map[qid].count_ranges());
460 for (std::map<int, Query>::iterator it = query_map.begin();
461 it != query_map.end() && queries.size() < (
unsigned int) config.
num_of_queries;
464 Query& q = it->second;
466 queries.push_back(q);
469 Logger::log(Logger::INFO,
"Finish loading queries!");
470 Logger::log(Logger::DEBUG,
"%d queries are loaded.", queries.size());
477 Logger::log(Logger::DEBUG,
"Table dim: %d.", table.
m_size());
483 std::vector<std::vector<int> >& query_points = *config.
query_points;
484 for (i = 0; i < query_points.size(); ++i)
488 for (j = 0; j < query_points[i].size() && (config.
search_type == 1 || j < (
unsigned int) config.
dim); ++j)
490 value = query_points[i][j];
493 value - radius, value + radius,
508 queries.push_back(q);
512 double timeInterval =
getInterval(starttime, endtime);
513 Logger::log(Logger::DEBUG,
"%d queries are created!", queries.size());
514 Logger::log(Logger::VERBOSE,
515 ">>>>[time profiling]: loading query takes %f ms<<<<",
523 Logger::log(Logger::DEBUG,
"Table dim: %d.", table.
m_size());
527 int value, min_value, max_value;
531 vector<vector<int> >& query_points = *config.
query_points;
532 vector<vector<int> > converted_query;
533 for(i = 0 ; i<query_points.size() ; ++i)
536 for(j = 0 ; j<query_points[i].size() ; ++j)
538 if(query_points[i][j] < min_value || query_points[i][j] > max_value)
540 line.push_back(query_points[i][j] - min_value);
543 converted_query.push_back(line);
546 vector<vector<int> > _gram_query, gram_query;
550 unordered_map<int, int> _map;
552 for(i = 0; i < _gram_query.size(); ++i)
555 for(j = 0; j < _gram_query[i].size(); ++j)
557 unordered_map<int, int>::iterator result = _map.find(_gram_query[i][j]);
558 if(result == _map.end())
560 _map.insert({_gram_query[i][j], 0});
569 gram_query.push_back(line);
576 for (i = 0; i < gram_query.size(); ++i)
588 for (j = 0; j < gram_query[i].size() ; ++j)
590 value = gram_query[i][j];
605 queries.push_back(q);
609 cout<<
"query build time = "<<
getInterval(starttime, query_end)<<
"ms."<<endl;
611 double timeInterval =
getInterval(starttime, endtime);
612 Logger::log(Logger::INFO,
"%d queries are created!", queries.size());
613 Logger::log(Logger::VERBOSE,
614 ">>>>[time profiling]: loading query takes %f ms<<<<",
639 double timeInterval =
getInterval(starttime, endtime);
640 Logger::log(Logger::DEBUG,
641 "Before finishing loading. i_size():%d, m_size():%d.",
643 Logger::log(Logger::VERBOSE,
644 ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
650 unsigned int item_num,
unsigned int *index,
unsigned int row_num,
670 double timeInterval =
getInterval(starttime, endtime);
671 Logger::log(Logger::DEBUG,
672 "Before finishing loading. i_size():%d, m_size():%d.",
674 Logger::log(Logger::VERBOSE,
675 ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
683 int min_value, max_value;
684 vector<vector<int> > converted_data;
685 vector<vector<int> > gram_data;
689 cout <<
"converted data done" << endl;
716 cout<<
"Start building index"<<endl;
739 cout<<
"Building table time = "<<
getInterval(tt1, tt2)<<endl;
746 double timeInterval =
getInterval(starttime, endtime);
747 Logger::log(Logger::DEBUG,
748 "Before finishing loading. i_size():%d, m_size():%d.",
750 Logger::log(Logger::INFO,
751 ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
763 Logger::log(Logger::VERBOSE,
"Starting preprocessing!");
766 Logger::log(Logger::VERBOSE,
"preprocessing finished!");
775 std::vector<int> result_count;
787 Logger::log(Logger::INFO,
"search for csv data!");
789 cout<<
"knn for csv finished!"<<endl;
800 Logger::log(Logger::VERBOSE,
801 ">>>>[time profiling]: knn_search totally takes %f ms (building query+match+selection)<<<<",
804 catch (thrust::system::system_error &e){
805 cout<<
"system_error : "<<e.what()<<endl;
808 cout<<
"bad_alloc"<<endl;
811 cout<<
"run time error"<<endl;
813 }
catch(std::bad_alloc &e){
814 cout<<
"cpu bad alloc"<<endl;
816 }
catch(std::exception &e){
817 cout<<
"cpu runtime"<<endl;
820 cout<<
"other error"<<endl;
821 std::string msg =
"Warning: Unknown Exception! Please try again or contact the author.\n";
827 std::vector<int>& h_topk, std::vector<int>& h_topk_count,
832 Logger::log(Logger::DEBUG,
"table.i_size():%d, config.hashtable_size:%f.",
840 thrust::device_vector<int> d_topk, d_topk_count;
844 Logger::log(Logger::DEBUG,
"max_load is %d", max_load);
848 Logger::log(Logger::INFO,
"knn search is done!");
849 Logger::log(Logger::DEBUG,
"Topk obtained: %d in total.", d_topk.size());
851 h_topk.resize(d_topk.size());
852 h_topk_count.resize(d_topk_count.size());
854 thrust::copy(d_topk.begin(), d_topk.end(), h_topk.begin());
855 thrust::copy(d_topk_count.begin(), d_topk_count.end(),
856 h_topk_count.begin());
860 vector<vector<int> >& h_topk, vector<vector<int> >& h_topk_count, vector<GPUGenie_Config>& configs)
863 vector<int> hashtable_sizes;
864 for (
size_t i = 0; i < tables.size(); ++i)
866 Logger::log(Logger::DEBUG,
"table.i_size():%d, config.hashtable_size:%f.",
867 tables.at(i)->i_size(), configs.at(i).hashtable_size);
868 if (configs.at(i).hashtable_size <= 2)
869 hashtable_sizes.push_back(tables.at(i)->i_size() * configs.at(i).hashtable_size + 1);
871 hashtable_sizes.push_back(configs.at(i).hashtable_size);
875 vector<int> max_loads;
876 for (
auto it = configs.begin(); it != configs.end(); ++it)
878 max_loads.push_back(it->multiplier * it->posting_list_max_length + 1);
882 vector<thrust::device_vector<int> > d_topk(configs.size()), d_topk_count(configs.size());
885 queries, d_topk, d_topk_count, hashtable_sizes, max_loads,
886 configs.at(0).count_threshold);
888 Logger::log(Logger::INFO,
"knn search is done!");
889 Logger::log(Logger::DEBUG,
"Topk obtained: %d in total.", d_topk.size());
892 auto it1 = d_topk.begin();
893 auto it2 = h_topk.begin();
895 for (; it1 != d_topk.end(); ++it1, ++it2)
897 it2->resize(it1->size());
898 thrust::copy(it1->begin(), it1->end(), it2->begin());
901 it1 = d_topk_count.begin();
902 it2 = h_topk_count.begin();
904 for (; it1 != d_topk_count.end(); ++it1, ++it2)
906 it2->resize(it1->size());
907 thrust::copy(it1->begin(), it1->end(), it2->begin());
918 vector<int> &resultOffset,
unsigned int shift_bits)
920 for(
unsigned int i = 0 ; i < result.size() ; ++i)
923 rowID = result[i]>>shift_bits;
924 offset = result[i] - (rowID<<shift_bits);
925 resultID.push_back(rowID);
926 resultOffset.push_back(offset);
931 int max_value,
int gram_length)
933 int num_of_value = max_value + 1;
936 for(
unsigned int i = 0 ; i < sequences.size() ; ++i)
938 int max_index = sequences[i].size() - gram_length + 1;
941 vector<int> line_null;
942 vector<int> temp_line;
943 for(
unsigned int p = 0; p < (
unsigned int)gram_length; ++p)
945 if(p < sequences[i].size() && sequences[i].size() <= 0)
break;
946 if(p < sequences[i].size())
947 line_null.push_back(sequences[i][p]);
949 line_null.push_back(sequences[i][0]);
953 for(
unsigned int p = 0; p < (
unsigned int)gram_length; ++p)
955 number = number*num_of_value + line_null[p];
957 temp_line.push_back(number);
958 gram_data.push_back(temp_line);
962 for(
unsigned int j = 0 ; j < (
unsigned int)max_index ; ++j)
965 for(
unsigned int k = 0 ; k < (
unsigned int)gram_length ; ++k )
967 gram_value = gram_value*num_of_value + sequences[i][j + k];
969 line.push_back(gram_value);
971 gram_data.push_back(line);
977 min_value = data[0][0];
978 max_value = min_value;
979 converted_data.clear();
980 for(
unsigned int i = 0 ; i < data.size() ; ++i)
981 for(
unsigned int j = 0 ; j < data[i].size() ; ++j)
983 if(data[i][j] > max_value) max_value = data[i][j];
984 if(data[i][j] < min_value) min_value = data[i][j];
986 for(
unsigned int i = 0 ; i < data.size() ; ++i)
989 for(
unsigned int j = 0 ; j < data[i].size() ; ++j)
990 line.push_back(data[i][j] - min_value);
991 converted_data.push_back(line);
genie::compression::COMPRESSION_TYPE compression
Config & SetGpuId(const uint8_t gpu_id)
Set the ID of the GPU used.
std::vector< std::vector< int > > * query_points
void invert_bijectMap(std::vector< std::vector< int > > &vin)
void load_query_singlerange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void topk(int k)
Set top k matches.
int get_gram_length_sequence()
Get the gram length.
void init_genie(GPUGenie_Config &config)
void load_table_bijectMap(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
bool cpy_data_to_gpu()
Copy vector _inv to gpu memory which is referenced by d_inv_p.
bool preprocess_for_knn_csv(GPUGenie_Config &config, genie::table::inv_table *&_table)
This is the top-level namespace of the project.
void set_gram_length_sequence(int gram_length)
Set length of each gram.
const COMPRESSION_TYPE DEFAULT_COMPRESSION_TYPE
int * d_inv_p
d_inv_p points to the start location for posting list array in GPU memory.
void knn_search_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query >> &queries, std::vector< std::vector< int > > &h_topk, std::vector< std::vector< int > > &h_topk_count, std::vector< GPUGenie_Config > &config)
void setCompression(genie::compression::COMPRESSION_TYPE compression)
unsigned long long getTime()
Get system time.
The declaration for class inv_table.
int posting_list_max_length
void knn_search_after_preprocess(GPUGenie_Config &config, genie::table::inv_table *&_table, std::vector< int > &result, std::vector< int > &result_count)
void swap(int *position, int offset1, int offset2)
This struct is used to construct query in multirange mode.
unsigned int max_data_size
void invert_subsequence(std::vector< std::vector< int >> &vin)
void knn_search_for_csv_data(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
void Init(genie::Config &config)
void load_query(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
int get_total_num_of_table() const
return the total_num_of_table.
void load_table(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
void apply_adaptive_query_range()
Construct query in adaptice range mode.
void append(inv_list &inv)
Append an inv_list to the inv_table.
int get_min_value_sequence()
Get the min value for sequences' elements in this inv_table.
void merge_knn_results_from_multiload(std::vector< std::vector< int > > &_result, std::vector< std::vector< int > > &_result_count, std::vector< int > &result, std::vector< int > &result_count, int table_num, int topk, int query_num, unsigned int iteration_times)
Definitions about configurations that can be set by users.
virtual void build(size_t max_length, bool use_load_balance)
Build the inv_table.
void get_rowID_offset(std::vector< int > &result, std::vector< int > &resultID, std::vector< int > &resultOffset, unsigned int shift_bits)
int shift_bits_sequence
This variable is used to tell the number of bits shifted for recording gram in different position...
void reset_device()
clear gpu memory
std::vector< genie::utility::attr_t > * multirange_query_points
Collection of knn functions.
void load_query_sequence(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void knn_bijectMap_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query > > &queries, std::vector< thrust::device_vector< int > > &d_top_indexes, std::vector< thrust::device_vector< int > > &d_top_count, std::vector< int > &hash_table_size, std::vector< int > &max_load, int bitmap_bits)
int partition(int *id_array, int *count_array, const int left, const int right)
Record run-time information.
Config class holds all user configurable settings of GENIE.
void attr(int index, int low, int up, float weight, int order)
Modify the matching range and weight of an attribute.
unsigned long long u64
A type definition for a 64-bit unsigned integer.
bool is_stored_in_gpu
is_stored_in_gpu tell whether inverted index structure is pre-stored inside gpu memory ...
void load_query_multirange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void knn_search(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
void set_table_index(int attr_index)
Set the table_index to 'index'.
#define GPUGENIE_DEFAULT_WEIGHT
std::vector< std::vector< int > > * data_points
double getInterval(unsigned long long start, unsigned long long stop)
Calculate time interval from start to end.
void print1DVector(vector< int > &toPrint)
Functions about getting system time.
void set_min_value_sequence(int min_value)
Used in sequence search. To set the min_value for all sequences' element.
void knn(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, thrust::device_vector< int > &d_top_indexes, thrust::device_vector< int > &d_top_count, int hash_table_size, int max_load, int bitmap_bits)
void set_total_num_of_table(int num)
Set the total_num_of_table to 'num'.
void load_table_sequence(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
void sequence_to_gram(std::vector< std::vector< int > > &sequences, std::vector< std::vector< int > > &gram_data, int max_value, int gram_length)
unsigned int num_of_iteration
bool use_subsequence_search
void setUncompressedPostingListMaxLength(size_t length)
define class inv_compre_table
void quicksortIdCount(int *id_array, int *count_array, const int left, const int right)
bool use_load_balance
Whether this query set is to use load balance.
This class manages one inverted list.
void invert(std::vector< int > &vin)
int get_max_value_sequence()
Get the max value.
void selectivity(float s)
Set the selectivity.
void sequence_reduce_to_ground(std::vector< std::vector< int > > &data, std::vector< std::vector< int > > &converted_data, int &min_value, int &max_value)
void set_max_value_sequence(int max_value)
Set the max value for all sequence. Compare to set_min_value_sequence()
void invert_sequence(std::vector< std::vector< int >> &vin, int shift_bits)
This is used for sequence match.
void append_sequence(inv_list &inv)
append inv_list for sequence search