19 #include <unordered_map>    22 #include <thrust/system_error.h>    23 #include <thrust/copy.h>    24 #include <thrust/device_vector.h>    25 #include <thrust/host_vector.h>    47     namespace genie { 
namespace compression {
    52 void swap(
int * position, 
int offset1, 
int offset2)
    54     int temp = *(position + offset1);
    55     *(position + offset1) = *(position + offset2);
    56     *(position + offset2) = temp;
    60 int partition(
int * id_array, 
int * count_array, 
const int left, 
const int right)
    64     const int pivot = count_array[left];
    69         while(i <= j && count_array[i] > pivot)
    73         while(i <= j && count_array[j] <=pivot)
    79             swap(count_array, i, j);
    85     swap(count_array, left, i-1);
    86     swap(id_array, left, i-1);
    91 void quicksortIdCount(
int * id_array, 
int * count_array, 
const int left, 
const int right)
    93     if(left >= right) 
return;
    95     int mid = 
partition(id_array, count_array, left, right);
   104     cout << 
"print vector 1D. " << endl;
   105     for(
unsigned int i = 0 ; i < toPrint.size() ; ++i)
   107         cout << toPrint[i] << 
",";
   115         std::vector<std::vector<int> >& _result_count, std::vector<int>& result,
   116         std::vector<int>& result_count, 
int table_num, 
int topk, 
int query_num, 
unsigned int iteration_times)
   118     cout << 
"result merge and rank " << endl;
   119     cout << 
"The " << iteration_times << 
"th iteration." << endl;
   121     cout << 
"topk = " << topk << endl;
   122     int individual_topk = topk / iteration_times;
   124     result_count.clear();
   125     for (
int i = 0; i < query_num; ++i)
   128         vector<int> sort_count;
   129         for (
int j = 0; j < table_num; ++j)
   134              sort.insert(sort.end(),
   135                     _result[j].begin() + (i * topk + individual_topk * (iteration_times - 1)),
   136                     _result[j].begin() + (i + 1) * topk);
   137              sort_count.insert(sort_count.end(),
   138                     _result_count[j].begin() + i * topk + individual_topk * (iteration_times - 1),
   139                     _result_count[j].begin() + (i + 1) * topk);
   143         result.insert(result.end(), sort.begin(), sort.begin() + individual_topk);
   144         result_count.insert(result_count.end(), sort_count.begin(), sort_count.begin()+individual_topk);
   148     cout << 
"Merge time = " << 
getInterval(start_merge, end_merge) << 
"ms. "<<endl;
   153     unsigned int cycle = 0;
   176             Logger::log(Logger::DEBUG, 
"build from data_points...");
   199         Logger::log(Logger::DEBUG, 
"build from data_points...");
   200             unsigned int table_num;
   209             cycle = table_num - 2;
   217         for (
unsigned int i = 0; i < cycle; ++i)
   219             vector<vector<int> > temp;
   220             temp.insert(temp.end(),
   242         if (table_num != cycle)
   244             vector<vector<int> > temp1;
   245             vector<vector<int> > temp2;
   246             unsigned int second_last_size = (config.
data_points->size()
   248             temp1.insert(temp1.end(),
   252             temp2.insert(temp2.end(),
   283         inv_table * &_table, std::vector<int>& result,
   284         std::vector<int>& result_count)
   286     std::vector<Query> queries;
   287     vector<vector<int> > _result;
   288     vector<vector<int> > _result_count;
   290     _result.resize(table_num);
   291     _result_count.resize(table_num);
   293     unsigned int accumulate_num = 0;
   294     for (
unsigned int i = 0; i < table_num; ++i)
   299         knn_search(_table[i], queries, _result[i], _result_count[i],config);
   301         if (i <= 0) 
continue;
   302         accumulate_num += _table[i - 1].
i_size();
   303         for (
unsigned int j = 0; j < _result[i].size(); ++j)
   305             _result[i][j] += accumulate_num;
   322     Logger::log(Logger::DEBUG, 
"Data row size: %d. Data Row Number: %d.",
   323             data_points[0].size(), data_points.size());
   326     for (i = 0; i < data_points[0].size(); ++i)
   328         std::vector<int> col;
   329         col.reserve(data_points.size());
   330         for (j = 0; j < data_points.size(); ++j)
   332             col.push_back(data_points[j][i]);
   345     double timeInterval = 
getInterval(starttime, endtime);
   346     Logger::log(Logger::DEBUG,
   347             "Before finishing loading. i_size():%d, m_size():%d.",
   349     Logger::log(Logger::VERBOSE,
   350             ">>>>[time profiling]: loading index takes %f ms<<<<",
   361     unsigned int row_size;
   362     unsigned int index_start_pos = 0;
   366         row_size = index[1] - index[0];
   368     index_start_pos = index[0];
   370     Logger::log(Logger::DEBUG, 
"Data row size: %d. Data Row Number: %d.",
   374     for (i = 0; i < row_size; ++i)
   376         std::vector<int> col;
   377         col.reserve(row_num);
   378         for (j = 0; j < row_num; ++j)
   380             col.push_back(data[index[j] + i - index_start_pos]);
   393     double timeInterval = 
getInterval(starttime, endtime);
   394     Logger::log(Logger::DEBUG,
   395             "Before finishing loading. i_size() : %d, m_size() : %d.",
   397     Logger::log(Logger::VERBOSE,
   398             ">>>>[time profiling]: loading index takes %f ms<<<<",
   431     map<int, Query> query_map;
   443         if (query_map.find(qid) == query_map.end())
   458         query_map[qid].
attr(dim, val, weight, sel, query_map[qid].count_ranges());
   460     for (std::map<int, Query>::iterator it = query_map.begin();
   461             it != query_map.end() && queries.size() < (
unsigned int) config.
num_of_queries;
   464         Query& q = it->second;
   466         queries.push_back(q);
   469     Logger::log(Logger::INFO, 
"Finish loading queries!");
   470     Logger::log(Logger::DEBUG, 
"%d queries are loaded.", queries.size());
   477     Logger::log(Logger::DEBUG, 
"Table dim: %d.", table.
m_size());
   483     std::vector<std::vector<int> >& query_points = *config.
query_points;
   484     for (i = 0; i < query_points.size(); ++i)
   488         for (j = 0; j < query_points[i].size() && (config.
search_type == 1 || j < (
unsigned int) config.
dim); ++j)
   490             value = query_points[i][j];
   493                     value - radius, value + radius,
   508         queries.push_back(q);
   512     double timeInterval = 
getInterval(starttime, endtime);
   513     Logger::log(Logger::DEBUG, 
"%d queries are created!", queries.size());
   514     Logger::log(Logger::VERBOSE,
   515             ">>>>[time profiling]: loading query takes %f ms<<<<",
   523     Logger::log(Logger::DEBUG, 
"Table dim: %d.", table.
m_size());
   527     int value, min_value, max_value;
   531     vector<vector<int> >& query_points = *config.
query_points;
   532     vector<vector<int> > converted_query;
   533     for(i = 0 ; i<query_points.size() ; ++i)
   536         for(j = 0 ; j<query_points[i].size() ; ++j)
   538             if(query_points[i][j] < min_value || query_points[i][j] > max_value)
   540             line.push_back(query_points[i][j] - min_value);
   543         converted_query.push_back(line);
   546     vector<vector<int> > _gram_query, gram_query;
   550     unordered_map<int, int> _map;
   552     for(i = 0; i < _gram_query.size(); ++i)
   555         for(j = 0; j < _gram_query[i].size(); ++j)
   557                 unordered_map<int, int>::iterator result = _map.find(_gram_query[i][j]);
   558                 if(result == _map.end())
   560                     _map.insert({_gram_query[i][j], 0});
   569         gram_query.push_back(line);
   576     for (i = 0; i < gram_query.size(); ++i)
   588         for (j = 0; j < gram_query[i].size() ; ++j)
   590             value = gram_query[i][j];
   605         queries.push_back(q);
   609         cout<<
"query build time = "<<
getInterval(starttime, query_end)<<
"ms."<<endl;
   611     double timeInterval = 
getInterval(starttime, endtime);
   612     Logger::log(Logger::INFO, 
"%d queries are created!", queries.size());
   613     Logger::log(Logger::VERBOSE,
   614             ">>>>[time profiling]: loading query takes %f ms<<<<",
   639     double timeInterval = 
getInterval(starttime, endtime);
   640     Logger::log(Logger::DEBUG,
   641             "Before finishing loading. i_size():%d, m_size():%d.",
   643     Logger::log(Logger::VERBOSE,
   644             ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
   650         unsigned int item_num, 
unsigned int *index, 
unsigned int row_num,
   670     double timeInterval = 
getInterval(starttime, endtime);
   671     Logger::log(Logger::DEBUG,
   672             "Before finishing loading. i_size():%d, m_size():%d.",
   674     Logger::log(Logger::VERBOSE,
   675             ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
   683     int min_value, max_value;
   684     vector<vector<int> > converted_data;
   685     vector<vector<int> > gram_data;
   689     cout << 
"converted data done" << endl;
   716     cout<<
"Start building index"<<endl;
   739     cout<<
"Building table time = "<<
getInterval(tt1, tt2)<<endl;
   746     double timeInterval = 
getInterval(starttime, endtime);
   747     Logger::log(Logger::DEBUG,
   748             "Before finishing loading. i_size():%d, m_size():%d.",
   750     Logger::log(Logger::INFO,
   751             ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
   763     Logger::log(Logger::VERBOSE, 
"Starting preprocessing!");
   766     Logger::log(Logger::VERBOSE, 
"preprocessing finished!");
   775     std::vector<int> result_count;
   787             Logger::log(Logger::INFO, 
"search for csv data!");
   789             cout<<
"knn for csv finished!"<<endl;
   800         Logger::log(Logger::VERBOSE,
   801                 ">>>>[time profiling]: knn_search totally takes %f ms (building query+match+selection)<<<<",
   804     catch (thrust::system::system_error &e){
   805         cout<<
"system_error : "<<e.what()<<endl;
   808         cout<<
"bad_alloc"<<endl;
   811         cout<<
"run time error"<<endl;
   813     } 
catch(std::bad_alloc &e){
   814         cout<<
"cpu bad alloc"<<endl;
   816     } 
catch(std::exception &e){
   817         cout<<
"cpu runtime"<<endl;
   820         cout<<
"other error"<<endl;
   821         std::string msg = 
"Warning: Unknown Exception! Please try again or contact the author.\n";
   827         std::vector<int>& h_topk, std::vector<int>& h_topk_count,
   832     Logger::log(Logger::DEBUG, 
"table.i_size():%d, config.hashtable_size:%f.",
   840     thrust::device_vector<int> d_topk, d_topk_count;
   844     Logger::log(Logger::DEBUG, 
"max_load is %d", max_load);
   848     Logger::log(Logger::INFO, 
"knn search is done!");
   849     Logger::log(Logger::DEBUG, 
"Topk obtained: %d in total.", d_topk.size());
   851     h_topk.resize(d_topk.size());
   852     h_topk_count.resize(d_topk_count.size());
   854     thrust::copy(d_topk.begin(), d_topk.end(), h_topk.begin());
   855     thrust::copy(d_topk_count.begin(), d_topk_count.end(),
   856             h_topk_count.begin());
   860         vector<vector<int> >& h_topk, vector<vector<int> >& h_topk_count, vector<GPUGenie_Config>& configs)
   863     vector<int> hashtable_sizes;
   864     for (
size_t i = 0; i < tables.size(); ++i)
   866         Logger::log(Logger::DEBUG, 
"table.i_size():%d, config.hashtable_size:%f.",
   867                 tables.at(i)->i_size(), configs.at(i).hashtable_size);
   868         if (configs.at(i).hashtable_size <= 2)
   869             hashtable_sizes.push_back(tables.at(i)->i_size() * configs.at(i).hashtable_size + 1);
   871             hashtable_sizes.push_back(configs.at(i).hashtable_size);
   875     vector<int> max_loads;
   876     for (
auto it = configs.begin(); it != configs.end(); ++it)
   878         max_loads.push_back(it->multiplier * it->posting_list_max_length + 1);
   882     vector<thrust::device_vector<int> > d_topk(configs.size()), d_topk_count(configs.size());
   885             queries, d_topk, d_topk_count, hashtable_sizes, max_loads,
   886             configs.at(0).count_threshold); 
   888     Logger::log(Logger::INFO, 
"knn search is done!");
   889     Logger::log(Logger::DEBUG, 
"Topk obtained: %d in total.", d_topk.size());
   892     auto it1 = d_topk.begin();
   893     auto it2 = h_topk.begin();
   895     for (; it1 != d_topk.end(); ++it1, ++it2)
   897         it2->resize(it1->size());
   898         thrust::copy(it1->begin(), it1->end(), it2->begin());
   901     it1 = d_topk_count.begin();
   902     it2 = h_topk_count.begin();
   904     for (; it1 != d_topk_count.end(); ++it1, ++it2)
   906         it2->resize(it1->size());
   907         thrust::copy(it1->begin(), it1->end(), it2->begin());
   918                     vector<int> &resultOffset, 
unsigned int shift_bits)
   920     for(
unsigned int i = 0 ; i < result.size() ; ++i)
   923         rowID = result[i]>>shift_bits;
   924         offset = result[i] - (rowID<<shift_bits);
   925         resultID.push_back(rowID);
   926         resultOffset.push_back(offset);
   931         int max_value, 
int gram_length)
   933     int num_of_value = max_value + 1;
   936     for(
unsigned int i = 0 ; i < sequences.size() ; ++i)
   938         int max_index = sequences[i].size() - gram_length + 1;
   941             vector<int> line_null;
   942             vector<int> temp_line;
   943             for(
unsigned int p = 0; p < (
unsigned int)gram_length; ++p)
   945                 if(p < sequences[i].size() && sequences[i].size() <= 0) 
break;
   946                 if(p < sequences[i].size())
   947                     line_null.push_back(sequences[i][p]);
   949                     line_null.push_back(sequences[i][0]);
   953             for(
unsigned int p = 0; p < (
unsigned int)gram_length; ++p)
   955                  number = number*num_of_value + line_null[p];
   957             temp_line.push_back(number);
   958             gram_data.push_back(temp_line);
   962         for(
unsigned int j = 0 ; j < (
unsigned int)max_index ; ++j)
   965             for(
unsigned int k = 0 ; k < (
unsigned int)gram_length ; ++k )
   967                 gram_value = gram_value*num_of_value + sequences[i][j + k];
   969             line.push_back(gram_value);
   971         gram_data.push_back(line);
   977     min_value = data[0][0];
   978     max_value = min_value;
   979     converted_data.clear();
   980     for(
unsigned int i = 0 ; i < data.size() ; ++i)
   981         for(
unsigned int j = 0 ; j < data[i].size() ; ++j)
   983             if(data[i][j] > max_value) max_value = data[i][j];
   984             if(data[i][j] < min_value) min_value = data[i][j];
   986     for(
unsigned int i = 0 ; i < data.size() ; ++i)
   989         for(
unsigned int j = 0 ; j < data[i].size() ; ++j)
   990             line.push_back(data[i][j] - min_value);
   991         converted_data.push_back(line);
 genie::compression::COMPRESSION_TYPE compression
Config & SetGpuId(const uint8_t gpu_id)
Set the ID of the GPU used. 
std::vector< std::vector< int > > * query_points
void invert_bijectMap(std::vector< std::vector< int > > &vin)
void load_query_singlerange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void topk(int k)
Set top k matches. 
int get_gram_length_sequence()
Get the gram length. 
void init_genie(GPUGenie_Config &config)
void load_table_bijectMap(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
bool cpy_data_to_gpu()
Copy vector _inv to gpu memory which is referenced by d_inv_p. 
bool preprocess_for_knn_csv(GPUGenie_Config &config, genie::table::inv_table *&_table)
This is the top-level namespace of the project. 
void set_gram_length_sequence(int gram_length)
Set length of each gram. 
const COMPRESSION_TYPE DEFAULT_COMPRESSION_TYPE
int * d_inv_p
d_inv_p points to the start location for posting list array in GPU memory. 
void knn_search_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query >> &queries, std::vector< std::vector< int > > &h_topk, std::vector< std::vector< int > > &h_topk_count, std::vector< GPUGenie_Config > &config)
void setCompression(genie::compression::COMPRESSION_TYPE compression)
unsigned long long getTime()
Get system time. 
The declaration for class inv_table. 
int posting_list_max_length
void knn_search_after_preprocess(GPUGenie_Config &config, genie::table::inv_table *&_table, std::vector< int > &result, std::vector< int > &result_count)
void swap(int *position, int offset1, int offset2)
This struct is used to construct query in multirange mode. 
unsigned int max_data_size
void invert_subsequence(std::vector< std::vector< int >> &vin)
void knn_search_for_csv_data(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
void Init(genie::Config &config)
void load_query(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
int get_total_num_of_table() const
return the total_num_of_table. 
void load_table(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
void apply_adaptive_query_range()
Construct query in adaptice range mode. 
void append(inv_list &inv)
Append an inv_list to the inv_table. 
int get_min_value_sequence()
Get the min value for sequences' elements in this inv_table. 
void merge_knn_results_from_multiload(std::vector< std::vector< int > > &_result, std::vector< std::vector< int > > &_result_count, std::vector< int > &result, std::vector< int > &result_count, int table_num, int topk, int query_num, unsigned int iteration_times)
Definitions about configurations that can be set by users. 
virtual void build(size_t max_length, bool use_load_balance)
Build the inv_table. 
void get_rowID_offset(std::vector< int > &result, std::vector< int > &resultID, std::vector< int > &resultOffset, unsigned int shift_bits)
int shift_bits_sequence
This variable is used to tell the number of bits shifted for recording gram in different position...
void reset_device()
clear gpu memory 
std::vector< genie::utility::attr_t > * multirange_query_points
Collection of knn functions. 
void load_query_sequence(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void knn_bijectMap_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query > > &queries, std::vector< thrust::device_vector< int > > &d_top_indexes, std::vector< thrust::device_vector< int > > &d_top_count, std::vector< int > &hash_table_size, std::vector< int > &max_load, int bitmap_bits)
int partition(int *id_array, int *count_array, const int left, const int right)
Record run-time information. 
Config class holds all user configurable settings of GENIE. 
void attr(int index, int low, int up, float weight, int order)
Modify the matching range and weight of an attribute. 
unsigned long long u64
A type definition for a 64-bit unsigned integer. 
bool is_stored_in_gpu
is_stored_in_gpu tell whether inverted index structure is pre-stored inside gpu memory ...
void load_query_multirange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void knn_search(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
void set_table_index(int attr_index)
Set the table_index to 'index'. 
#define GPUGENIE_DEFAULT_WEIGHT
std::vector< std::vector< int > > * data_points
double getInterval(unsigned long long start, unsigned long long stop)
Calculate time interval from start to end. 
void print1DVector(vector< int > &toPrint)
Functions about getting system time. 
void set_min_value_sequence(int min_value)
Used in sequence search. To set the min_value for all sequences' element. 
void knn(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, thrust::device_vector< int > &d_top_indexes, thrust::device_vector< int > &d_top_count, int hash_table_size, int max_load, int bitmap_bits)
void set_total_num_of_table(int num)
Set the total_num_of_table to 'num'. 
void load_table_sequence(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
void sequence_to_gram(std::vector< std::vector< int > > &sequences, std::vector< std::vector< int > > &gram_data, int max_value, int gram_length)
unsigned int num_of_iteration
bool use_subsequence_search
void setUncompressedPostingListMaxLength(size_t length)
define class inv_compre_table 
void quicksortIdCount(int *id_array, int *count_array, const int left, const int right)
bool use_load_balance
Whether this query set is to use load balance. 
This class manages one inverted list. 
void invert(std::vector< int > &vin)
int get_max_value_sequence()
Get the max value. 
void selectivity(float s)
Set the selectivity. 
void sequence_reduce_to_ground(std::vector< std::vector< int > > &data, std::vector< std::vector< int > > &converted_data, int &min_value, int &max_value)
void set_max_value_sequence(int max_value)
Set the max value for all sequence. Compare to set_min_value_sequence() 
void invert_sequence(std::vector< std::vector< int >> &vin, int shift_bits)
This is used for sequence match. 
void append_sequence(inv_list &inv)
append inv_list for sequence search