GENIE
interface.cc
Go to the documentation of this file.
1 
6 #include <stdio.h>
7 #include <stdlib.h>
8 
9 #include <iostream>
10 #include <sstream>
11 #include <fstream>
12 
13 #include <string>
14 #include <sys/time.h>
15 #include <ctime>
16 #include <map>
17 #include <vector>
18 #include <algorithm>
19 #include <unordered_map>
20 #include <list>
21 
22 #include <thrust/system_error.h>
23 #include <thrust/copy.h>
24 #include <thrust/device_vector.h>
25 #include <thrust/host_vector.h>
26 
27 #include <genie/configure.h>
28 #ifdef GENIE_COMPR
30 #endif
31 #include <genie/matching/knn.h>
33 #include <genie/utility/init.h>
34 #include <genie/utility/Logger.h>
35 #include <genie/utility/Timing.h>
36 
37 #include "interface.h"
38 
39 using namespace genie::compression;
40 using namespace genie::table;
41 using namespace genie::original;
42 using namespace genie::query;
43 using namespace genie::utility;
44 using namespace std;
45 
46 #ifndef GENIE_COMPR
47  namespace genie { namespace compression {
49  }}
50 #endif
51 
52 void swap(int * position, int offset1, int offset2)
53 {
54  int temp = *(position + offset1);
55  *(position + offset1) = *(position + offset2);
56  *(position + offset2) = temp;
57 }
58 
59 
60 int partition(int * id_array, int * count_array, const int left, const int right)// right is inclusive
61 
62 {
63 
64  const int pivot = count_array[left];
65  int i = left + 1;
66  int j = right;
67  while(i <= j)
68  {
69  while(i <= j && count_array[i] > pivot)
70  {
71  i++;
72  }
73  while(i <= j && count_array[j] <=pivot)
74  {
75  j--;
76  }
77  if(i < j)
78  {
79  swap(count_array, i, j);
80  swap(id_array, i, j);
81  }
82 
83  }
84 
85  swap(count_array, left, i-1);
86  swap(id_array, left, i-1);
87  return i-1;
88 
89 }
90 
91 void quicksortIdCount(int * id_array, int * count_array, const int left, const int right)//right is inclusive
92 {
93  if(left >= right) return;
94 
95  int mid = partition(id_array, count_array, left, right);
96 
97  quicksortIdCount(id_array, count_array, left, mid-1);
98  quicksortIdCount(id_array, count_array, mid + 1, right);
99 }
100 
101 
102 void print1DVector(vector<int> & toPrint)
103 {
104  cout << "print vector 1D. " << endl;
105  for(unsigned int i = 0 ; i < toPrint.size() ; ++i)
106  {
107  cout << toPrint[i] << ",";
108  }
109  cout << endl;
110 }
111 
112 
113 //merge result
114 void merge_knn_results_from_multiload(std::vector<std::vector<int> >& _result,
115  std::vector<std::vector<int> >& _result_count, std::vector<int>& result,
116  std::vector<int>& result_count, int table_num, int topk, int query_num, unsigned int iteration_times)
117 {
118  cout << "result merge and rank " << endl;
119  cout << "The " << iteration_times << "th iteration." << endl;
120  u64 start_merge = getTime();
121  cout << "topk = " << topk << endl;
122  int individual_topk = topk / iteration_times;
123  result.clear();
124  result_count.clear();
125  for (int i = 0; i < query_num; ++i)
126  {
127  vector<int> sort;
128  vector<int> sort_count;
129  for (int j = 0; j < table_num; ++j)
130  {
131  //print1DVector(_result[j]);
132  //print1DVector(_result_count[j]);
133 
134  sort.insert(sort.end(),
135  _result[j].begin() + (i * topk + individual_topk * (iteration_times - 1)),
136  _result[j].begin() + (i + 1) * topk);
137  sort_count.insert(sort_count.end(),
138  _result_count[j].begin() + i * topk + individual_topk * (iteration_times - 1),
139  _result_count[j].begin() + (i + 1) * topk);
140  }
141 
142  quicksortIdCount(&sort[0], &sort_count[0], 0, sort.size()-1);
143  result.insert(result.end(), sort.begin(), sort.begin() + individual_topk);
144  result_count.insert(result_count.end(), sort_count.begin(), sort_count.begin()+individual_topk);
145 
146  }
147  u64 end_merge = getTime();
148  cout << "Merge time = " << getInterval(start_merge, end_merge) << "ms. "<<endl;
149 }
151  inv_table * &_table)
152 {
153  unsigned int cycle = 0;
154 
155  config.num_of_topk = config.num_of_iteration * config.num_of_topk;
156 
157 
158  if (config.max_data_size >= config.data_points->size() || config.max_data_size <= 0)
159  {
160  if (config.data_points->size() > 0)
161  {
162  if (config.compression == NO_COMPRESSION)
163  _table = new inv_table[1];
164  #ifdef GENIE_COMPR
165  else
166  {
167  inv_compr_table * comprTable = new inv_compr_table[1];
168  comprTable[0].setCompression(config.compression);
170  _table = comprTable;
171  }
172  #endif
173 
174  _table[0].set_table_index(0);
175  _table[0].set_total_num_of_table(1);
176  Logger::log(Logger::DEBUG, "build from data_points...");
177  switch (config.search_type)
178  {
179  case 0:
180  load_table(_table[0], *(config.data_points), config);
181  break;
182  case 1:
183  load_table_bijectMap(_table[0], *(config.data_points), config);
184  break;
185  case 2:
186  load_table_sequence(_table[0], *(config.data_points), config);
187  break;
188  default:
189  throw genie::exception::cpu_runtime_error("Unrecognised search type!");
190  }
191  }
192  else
193  {
194  throw genie::exception::cpu_runtime_error("no data input!");
195  }
196  }
197  else
198  {
199  Logger::log(Logger::DEBUG, "build from data_points...");
200  unsigned int table_num;
201  if (config.data_points->size() % config.max_data_size == 0)
202  {
203  table_num = config.data_points->size() / config.max_data_size;
204  cycle = table_num;
205  }
206  else
207  {
208  table_num = config.data_points->size() / config.max_data_size + 1;
209  cycle = table_num - 2;
210  }
211 
212  if (config.compression == NO_COMPRESSION)
213  _table = new inv_table[table_num];
214  else
215  throw new genie::exception::cpu_runtime_error("Compression for multiple tables to yet supported");
216 
217  for (unsigned int i = 0; i < cycle; ++i)
218  {
219  vector<vector<int> > temp;
220  temp.insert(temp.end(),
221  config.data_points->begin() + i * config.max_data_size,
222  config.data_points->begin()
223  + (i + 1) * config.max_data_size);
224 
225  _table[i].set_table_index(i);
226  _table[i].set_total_num_of_table(table_num);
227  switch (config.search_type)
228  {
229  case 0:
230  load_table(_table[i], temp, config);
231  break;
232  case 1:
233  load_table_bijectMap(_table[i], temp, config);
234  break;
235  case 2:
236  load_table_sequence(_table[i], temp, config);
237  break;
238  default:
239  throw genie::exception::cpu_runtime_error("Unrecognised search type!");
240  }
241  }
242  if (table_num != cycle)
243  {
244  vector<vector<int> > temp1;
245  vector<vector<int> > temp2;
246  unsigned int second_last_size = (config.data_points->size()
247  - cycle * config.max_data_size) / 2;
248  temp1.insert(temp1.end(),
249  config.data_points->begin() + cycle * config.max_data_size,
250  config.data_points->begin() + cycle * config.max_data_size
251  + second_last_size);
252  temp2.insert(temp2.end(),
253  config.data_points->begin() + cycle * config.max_data_size
254  + second_last_size, config.data_points->end());
255 
256  _table[cycle].set_table_index(cycle);
257  _table[cycle].set_total_num_of_table(table_num);
258  _table[cycle + 1].set_table_index(cycle+1);
259  _table[cycle + 1].set_total_num_of_table(table_num);
260  switch (config.search_type)
261  {
262  case 0:
263  load_table(_table[cycle], temp1, config);
264  load_table(_table[cycle + 1], temp2, config);
265  break;
266  case 1:
267  load_table_bijectMap(_table[cycle], temp1, config);
268  load_table_bijectMap(_table[cycle + 1], temp2, config);
269  break;
270  case 2:
271  load_table_sequence(_table[cycle], temp1, config);
272  load_table_sequence(_table[cycle + 1], temp2,config);
273  break;
274  default:
275  throw genie::exception::cpu_runtime_error("Unrecognised search type!");
276  }
277  }
278  }
279  return true;
280 }
281 
283  inv_table * &_table, std::vector<int>& result,
284  std::vector<int>& result_count)
285 {
286  std::vector<Query> queries;
287  vector<vector<int> > _result;
288  vector<vector<int> > _result_count;
289  unsigned int table_num = _table[0].get_total_num_of_table();
290  _result.resize(table_num);
291  _result_count.resize(table_num);
292 
293  unsigned int accumulate_num = 0;
294  for (unsigned int i = 0; i < table_num; ++i)
295  {
296  queries.clear();
297  load_query(_table[i], queries, config);
298 
299  knn_search(_table[i], queries, _result[i], _result_count[i],config);
300 
301  if (i <= 0) continue;
302  accumulate_num += _table[i - 1].i_size();
303  for (unsigned int j = 0; j < _result[i].size(); ++j)
304  {
305  _result[i][j] += accumulate_num;
306  }
307  }
308 
309 
310 
311  merge_knn_results_from_multiload(_result, _result_count, result,
312  result_count, table_num, config.num_of_topk, queries.size(), config.num_of_iteration);
313 
314 
315 }
317  std::vector<std::vector<int> >& data_points, GPUGenie_Config& config)
318 {
319  inv_list list;
320  u32 i, j;
321 
322  Logger::log(Logger::DEBUG, "Data row size: %d. Data Row Number: %d.",
323  data_points[0].size(), data_points.size());
324  u64 starttime = getTime();
325 
326  for (i = 0; i < data_points[0].size(); ++i)
327  {
328  std::vector<int> col;
329  col.reserve(data_points.size());
330  for (j = 0; j < data_points.size(); ++j)
331  {
332  col.push_back(data_points[j][i]);
333  }
334  list.invert(col);
335  table.append(list);
336  }
337 
338  table.build(config.posting_list_max_length, config.use_load_balance);
339 
340  if (config.save_to_gpu && table.d_inv_p == NULL)
341  table.cpy_data_to_gpu();
342  table.is_stored_in_gpu = config.save_to_gpu;
343 
344  u64 endtime = getTime();
345  double timeInterval = getInterval(starttime, endtime);
346  Logger::log(Logger::DEBUG,
347  "Before finishing loading. i_size():%d, m_size():%d.",
348  table.i_size(), table.m_size());
349  Logger::log(Logger::VERBOSE,
350  ">>>>[time profiling]: loading index takes %f ms<<<<",
351  timeInterval);
352 
353 }
354 
355 void genie::original::load_table(inv_table& table, int *data, unsigned int item_num,
356  unsigned int *index, unsigned int row_num, GPUGenie_Config& config)
357 {
358  inv_list list;
359  u32 i, j;
360 
361  unsigned int row_size;
362  unsigned int index_start_pos = 0;
363  if (row_num == 1)
364  row_size = item_num;
365  else
366  row_size = index[1] - index[0];
367 
368  index_start_pos = index[0];
369 
370  Logger::log(Logger::DEBUG, "Data row size: %d. Data Row Number: %d.",
371  index[1], row_num);
372  u64 starttime = getTime();
373 
374  for (i = 0; i < row_size; ++i)
375  {
376  std::vector<int> col;
377  col.reserve(row_num);
378  for (j = 0; j < row_num; ++j)
379  {
380  col.push_back(data[index[j] + i - index_start_pos]);
381  }
382  list.invert(col);
383  table.append(list);
384  }
385 
386  table.build(config.posting_list_max_length, config.use_load_balance);
387 
388  if (config.save_to_gpu && table.d_inv_p == NULL)
389  table.cpy_data_to_gpu();
390  table.is_stored_in_gpu = config.save_to_gpu;
391 
392  u64 endtime = getTime();
393  double timeInterval = getInterval(starttime, endtime);
394  Logger::log(Logger::DEBUG,
395  "Before finishing loading. i_size() : %d, m_size() : %d.",
396  table.i_size(), table.m_size());
397  Logger::log(Logger::VERBOSE,
398  ">>>>[time profiling]: loading index takes %f ms<<<<",
399  timeInterval);
400 
401 }
402 
403 void genie::original::load_query(inv_table& table, std::vector<Query>& queries,
404  GPUGenie_Config& config)
405 {
406  if(config.search_type == 2)
407  {
408  load_query_sequence(table, queries, config);
409  return;
410  }
411  if (config.use_multirange)
412  {
413  load_query_multirange(table, queries, config);
414  }
415  else
416  {
417  load_query_singlerange(table, queries, config);
418  }
419 }
420 
421 //Read new format query data
422 //Sample data format
423 //qid dim value selectivity weight
424 // 0 0 15 0.04 1
425 // 0 1 6 0.04 1
426 // ....
428  std::vector<Query>& queries, GPUGenie_Config& config)
429 {
430  queries.clear();
431  map<int, Query> query_map;
432  int qid, dim, val;
433  float sel, weight;
434  for (unsigned int iq = 0; iq < config.multirange_query_points->size(); ++iq)
435  {
436  attr_t& attr = (*config.multirange_query_points)[iq];
437 
438  qid = attr.qid;
439  dim = attr.dim;
440  val = attr.value;
441  weight = attr.weight;
442  sel = attr.sel;
443  if (query_map.find(qid) == query_map.end())
444  {
445  Query q(table, qid);
446  q.topk(config.num_of_topk);
447  if (config.selectivity > 0.0f)
448  {
449  q.selectivity(config.selectivity);
450  }
451  if (config.use_load_balance)
452  {
453  q.use_load_balance = true;
454  }
455  query_map[qid] = q;
456 
457  }
458  query_map[qid].attr(dim, val, weight, sel, query_map[qid].count_ranges());
459  }
460  for (std::map<int, Query>::iterator it = query_map.begin();
461  it != query_map.end() && queries.size() < (unsigned int) config.num_of_queries;
462  ++it)
463  {
464  Query& q = it->second;
466  queries.push_back(q);
467  }
468 
469  Logger::log(Logger::INFO, "Finish loading queries!");
470  Logger::log(Logger::DEBUG, "%d queries are loaded.", queries.size());
471 
472 }
474  std::vector<Query>& queries, GPUGenie_Config& config)
475 {
476 
477  Logger::log(Logger::DEBUG, "Table dim: %d.", table.m_size());
478  u64 starttime = getTime();
479 
480  u32 i, j;
481  int value;
482  int radius = config.query_radius;
483  std::vector<std::vector<int> >& query_points = *config.query_points;
484  for (i = 0; i < query_points.size(); ++i)
485  {
486  Query q(table, i);
487 
488  for (j = 0; j < query_points[i].size() && (config.search_type == 1 || j < (unsigned int) config.dim); ++j)
489  {
490  value = query_points[i][j];
491 
492  q.attr(config.search_type == 1 ? 0 : j,
493  value - radius, value + radius,
495  }
496 
497  q.topk(config.num_of_topk);
498  q.selectivity(config.selectivity);
499  if (config.use_adaptive_range)
500  {
502  }
503  if (config.use_load_balance)
504  {
505  q.use_load_balance = true;
506  }
507 
508  queries.push_back(q);
509  }
510 
511  u64 endtime = getTime();
512  double timeInterval = getInterval(starttime, endtime);
513  Logger::log(Logger::DEBUG, "%d queries are created!", queries.size());
514  Logger::log(Logger::VERBOSE,
515  ">>>>[time profiling]: loading query takes %f ms<<<<",
516  timeInterval);
517 }
518 
520  vector<Query>& queries, GPUGenie_Config& config)
521 {
522 
523  Logger::log(Logger::DEBUG, "Table dim: %d.", table.m_size());
524  u64 starttime = getTime();
525 
526  u32 i, j;
527  int value, min_value, max_value;
528  min_value = table.get_min_value_sequence();
529  max_value = table.get_max_value_sequence();
530 
531  vector<vector<int> >& query_points = *config.query_points;
532  vector<vector<int> > converted_query;
533  for(i = 0 ; i<query_points.size() ; ++i)
534  {
535  vector<int> line;
536  for(j = 0 ; j<query_points[i].size() ; ++j)
537  {
538  if(query_points[i][j] < min_value || query_points[i][j] > max_value)
539  continue;
540  line.push_back(query_points[i][j] - min_value);
541  }
542 
543  converted_query.push_back(line);
544  }
545 
546  vector<vector<int> > _gram_query, gram_query;
547  sequence_to_gram(converted_query, _gram_query, table.get_max_value_sequence() - min_value, table.get_gram_length_sequence());
548 
549 
550  unordered_map<int, int> _map;
551 
552  for(i = 0; i < _gram_query.size(); ++i)
553  {
554  vector<int> line;
555  for(j = 0; j < _gram_query[i].size(); ++j)
556  {
557  unordered_map<int, int>::iterator result = _map.find(_gram_query[i][j]);
558  if(result == _map.end())
559  {
560  _map.insert({_gram_query[i][j], 0});
561  line.push_back(_gram_query[i][j]<<table.shift_bits_sequence);// should be 6 here
562  }
563  else if(result->second < (int)pow(2, table.shift_bits_sequence) -1)
564  {
565  result->second += 1;
566  line.push_back((result->first<<table.shift_bits_sequence) + result->second);
567  }
568  }
569  gram_query.push_back(line);
570  _map.clear();
571  }
572 
573  u64 query_start = getTime();
574 
575 
576  for (i = 0; i < gram_query.size(); ++i)
577  {
578  Query q(table, i);
579 
580  /*
581  int min_bound,max_bound;
582  min_bound = (int)gram_query[i].size() - ((int)gram_query[i].size()+2)*config.edit_distance_diff;//(L-2) - L*diff, L is the original length
583  max_bound = (int)gram_query[i].size() + ((int)gram_query[i].size()+2)*config.edit_distance_diff + 1; //(L-2) + L*diff, exclusive
584 
585  if(min_bound < 0) min_bound = 0;
586  if(max_bound > table.m_size()) max_bound = table.m_size();
587  */
588  for (j = 0; j < gram_query[i].size() ; ++j)
589  {
590  value = gram_query[i][j];
591  if (value < 0)
592  {
593  continue;
594  }
595 
596  q.attr(0, value, value, GPUGENIE_DEFAULT_WEIGHT, 0);
597  }
598 
599  q.topk(config.num_of_topk);
600  if (config.use_load_balance)
601  {
602  q.use_load_balance = true;
603  }
604 
605  queries.push_back(q);
606  }
607  u64 query_end = getTime();
608 
609  cout<<"query build time = "<<getInterval(starttime, query_end)<<"ms."<<endl;
610  u64 endtime = getTime();
611  double timeInterval = getInterval(starttime, endtime);
612  Logger::log(Logger::INFO, "%d queries are created!", queries.size());
613  Logger::log(Logger::VERBOSE,
614  ">>>>[time profiling]: loading query takes %f ms<<<<",
615  timeInterval);
616 }
617 
618 
619 
620 
622  std::vector<std::vector<int> >& data_points, GPUGenie_Config& config)
623 {
624  u64 starttime = getTime();
625 
626  inv_list list;
627  if(config.use_subsequence_search)
628  list.invert_subsequence(data_points);
629  else
630  list.invert_bijectMap(data_points);
631  table.append(list);
632  table.build(config.posting_list_max_length, config.use_load_balance);
633 
634  if (config.save_to_gpu && table.d_inv_p == NULL)
635  table.cpy_data_to_gpu();
636  table.is_stored_in_gpu = config.save_to_gpu;
637 
638  u64 endtime = getTime();
639  double timeInterval = getInterval(starttime, endtime);
640  Logger::log(Logger::DEBUG,
641  "Before finishing loading. i_size():%d, m_size():%d.",
642  table.i_size(), table.m_size());
643  Logger::log(Logger::VERBOSE,
644  ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
645  timeInterval);
646 
647 }
648 
650  unsigned int item_num, unsigned int *index, unsigned int row_num,
651  GPUGenie_Config& config)
652 {
653 
654  u64 starttime = getTime();
655 
656  inv_list list;
657  if(config.use_subsequence_search)
658  list.invert_subsequence(data, item_num, index, row_num);
659  else
660  list.invert_bijectMap(data, item_num, index, row_num);
661 
662  table.append(list);
663  table.build(config.posting_list_max_length, config.use_load_balance);
664 
665  if (config.save_to_gpu && table.d_inv_p == NULL)
666  table.cpy_data_to_gpu();
667  table.is_stored_in_gpu = config.save_to_gpu;
668 
669  u64 endtime = getTime();
670  double timeInterval = getInterval(starttime, endtime);
671  Logger::log(Logger::DEBUG,
672  "Before finishing loading. i_size():%d, m_size():%d.",
673  table.i_size(), table.m_size());
674  Logger::log(Logger::VERBOSE,
675  ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
676  timeInterval);
677 
678 }
679 
680 void genie::original::load_table_sequence(inv_table& table, vector<vector<int> >& data_points, GPUGenie_Config& config)
681 {
682  u64 starttime = getTime();
683  int min_value, max_value;
684  vector<vector<int> > converted_data;
685  vector<vector<int> > gram_data;
686  sequence_reduce_to_ground(data_points, converted_data ,min_value ,max_value);
687 
688 
689  cout << "converted data done" << endl;
690 
691 
692  table.set_min_value_sequence(min_value);
693  table.set_max_value_sequence(max_value);
694  sequence_to_gram(converted_data, gram_data, max_value - min_value ,config.data_gram_length);
696 
697  table.shift_bits_sequence = 6;
698 
699  /*
700  vector<vector<int> > length_id;//th 1st element, length is 1
701  vector<inv_list> lists;
702  for(unsigned int i = 0; i < gram_data.size(); ++i)
703  {
704  if(gram_data[i].size() <= 0)
705  continue;
706  if(length_id.size() < gram_data[i].size())
707  length_id.resize(gram_data[i].size());
708  length_id[gram_data[i].size()-1].push_back(i);
709 
710  }
711 
712  lists.resize(length_id.size());
713  */
714  //config.dim = length_id.size();
715  config.dim = 1;
716  cout<<"Start building index"<<endl;
717  u64 tt1 = getTime();
718  /*
719  for(unsigned int i = 0; i < length_id.size(); ++i)
720  {
721  vector<vector<int> > temp_set;
722  vector<int> respective_id;
723  respective_id = length_id[i];
724  for(unsigned int j = 0; j < length_id[i].size(); ++j)
725  temp_set.push_back(gram_data[length_id[i][j]]);
726 
727  inv_list list;
728  list.invert_sequence(temp_set, table.shift_bits_sequence, respective_id);
729  table.append_sequence(list);
730  }
731  */
732 
733  inv_list list;
734  list.invert_sequence(gram_data, table.shift_bits_sequence);//For now, the number of bits is set to 6 internally.
735  table.append_sequence(list);
736  table.build(config.posting_list_max_length, config.use_load_balance);
737  u64 tt2 = getTime();
738 
739  cout<<"Building table time = "<<getInterval(tt1, tt2)<<endl;
740 
741  if(config.save_to_gpu && table.d_inv_p == NULL)
742  table.cpy_data_to_gpu();
743  table.is_stored_in_gpu = config.save_to_gpu;
744 
745  u64 endtime = getTime();
746  double timeInterval = getInterval(starttime, endtime);
747  Logger::log(Logger::DEBUG,
748  "Before finishing loading. i_size():%d, m_size():%d.",
749  table.i_size(), table.m_size());
750  Logger::log(Logger::INFO,
751  ">>>>[time profiling]: loading index takes %f ms (for one dim multi-values)<<<<",
752  timeInterval);
753 
754 
755 
756 }
757 
758 void genie::original::knn_search_for_csv_data(std::vector<int>& result,
759  std::vector<int>& result_count, GPUGenie_Config& config)
760 {
761  inv_table *_table = NULL;
762 
763  Logger::log(Logger::VERBOSE, "Starting preprocessing!");
764  preprocess_for_knn_csv(config, _table);
765 
766  Logger::log(Logger::VERBOSE, "preprocessing finished!");
767 
768  knn_search_after_preprocess(config, _table, result, result_count);
769 
770  delete[] _table;
771 }
772 
773 void genie::original::knn_search(std::vector<int>& result, GPUGenie_Config& config)
774 {
775  std::vector<int> result_count;
776  knn_search(result, result_count, config);
777 }
778 
779 void genie::original::knn_search(std::vector<int>& result,
780  std::vector<int>& result_count, GPUGenie_Config& config)
781 {
782  try{
783  u64 starttime = getTime();
784  switch (config.data_type)
785  {
786  case 0:
787  Logger::log(Logger::INFO, "search for csv data!");
788  knn_search_for_csv_data(result, result_count, config);
789  cout<<"knn for csv finished!"<<endl;
790  break;
791  case 1:
792  throw genie::exception::cpu_runtime_error("Binary .dat data_type no longer supported\n");
793  default:
794  throw genie::exception::cpu_runtime_error("Please check data type in config\n");
795  }
796 
797  u64 endtime = getTime();
798  double elapsed = getInterval(starttime, endtime);
799 
800  Logger::log(Logger::VERBOSE,
801  ">>>>[time profiling]: knn_search totally takes %f ms (building query+match+selection)<<<<",
802  elapsed);
803  }
804  catch (thrust::system::system_error &e){
805  cout<<"system_error : "<<e.what()<<endl;
806  throw genie::exception::gpu_runtime_error(e.what());
807  } catch (genie::exception::gpu_bad_alloc &e){
808  cout<<"bad_alloc"<<endl;
809  throw e;
811  cout<<"run time error"<<endl;
812  throw e;
813  } catch(std::bad_alloc &e){
814  cout<<"cpu bad alloc"<<endl;
815  throw genie::exception::cpu_bad_alloc(e.what());
816  } catch(std::exception &e){
817  cout<<"cpu runtime"<<endl;
818  throw genie::exception::cpu_runtime_error(e.what());
819  } catch(...){
820  cout<<"other error"<<endl;
821  std::string msg = "Warning: Unknown Exception! Please try again or contact the author.\n";
822  throw genie::exception::cpu_runtime_error(msg.c_str());
823  }
824 }
825 
826 void genie::original::knn_search(inv_table& table, std::vector<Query>& queries,
827  std::vector<int>& h_topk, std::vector<int>& h_topk_count,
828  GPUGenie_Config& config)
829 {
830  int hashtable_size;
831 
832  Logger::log(Logger::DEBUG, "table.i_size():%d, config.hashtable_size:%f.",
833  table.i_size(), config.hashtable_size);
834 
835  if (config.hashtable_size <= 2)
836  hashtable_size = table.i_size() * config.hashtable_size + 1;
837  else
838  hashtable_size = config.hashtable_size;
839 
840  thrust::device_vector<int> d_topk, d_topk_count;
841 
842  int max_load = config.multiplier * config.posting_list_max_length + 1;
843 
844  Logger::log(Logger::DEBUG, "max_load is %d", max_load);
845 
846  genie::matching::knn(table, queries, d_topk, d_topk_count, hashtable_size, max_load, config.count_threshold);
847 
848  Logger::log(Logger::INFO, "knn search is done!");
849  Logger::log(Logger::DEBUG, "Topk obtained: %d in total.", d_topk.size());
850 
851  h_topk.resize(d_topk.size());
852  h_topk_count.resize(d_topk_count.size());
853 
854  thrust::copy(d_topk.begin(), d_topk.end(), h_topk.begin());
855  thrust::copy(d_topk_count.begin(), d_topk_count.end(),
856  h_topk_count.begin());
857 }
858 
859 void genie::original::knn_search_MT(vector<inv_table*>& tables, vector<vector<Query> >& queries,
860  vector<vector<int> >& h_topk, vector<vector<int> >& h_topk_count, vector<GPUGenie_Config>& configs)
861 {
862  /* hashtable size */
863  vector<int> hashtable_sizes;
864  for (size_t i = 0; i < tables.size(); ++i)
865  {
866  Logger::log(Logger::DEBUG, "table.i_size():%d, config.hashtable_size:%f.",
867  tables.at(i)->i_size(), configs.at(i).hashtable_size);
868  if (configs.at(i).hashtable_size <= 2)
869  hashtable_sizes.push_back(tables.at(i)->i_size() * configs.at(i).hashtable_size + 1);
870  else
871  hashtable_sizes.push_back(configs.at(i).hashtable_size);
872  }
873 
874  /* max load */
875  vector<int> max_loads;
876  for (auto it = configs.begin(); it != configs.end(); ++it)
877  {
878  max_loads.push_back(it->multiplier * it->posting_list_max_length + 1);
879  //Logger::log(Logger::DEBUG, "max_load is %d", max_loads);
880  }
881 
882  vector<thrust::device_vector<int> > d_topk(configs.size()), d_topk_count(configs.size());
884  tables, //basic API, since encode dimension and value is also finally transformed as a bijection map
885  queries, d_topk, d_topk_count, hashtable_sizes, max_loads,
886  configs.at(0).count_threshold); // threshold is the same
887 
888  Logger::log(Logger::INFO, "knn search is done!");
889  Logger::log(Logger::DEBUG, "Topk obtained: %d in total.", d_topk.size());
890 
891  /* copy result */
892  auto it1 = d_topk.begin();
893  auto it2 = h_topk.begin();
894 //#pragma omp parallel for private(it1, it2)
895  for (; it1 != d_topk.end(); ++it1, ++it2)
896  {
897  it2->resize(it1->size());
898  thrust::copy(it1->begin(), it1->end(), it2->begin());
899  }
900 
901  it1 = d_topk_count.begin();
902  it2 = h_topk_count.begin();
903 //#pragma omp parallel for private(it1, it2)
904  for (; it1 != d_topk_count.end(); ++it1, ++it2)
905  {
906  it2->resize(it1->size());
907  thrust::copy(it1->begin(), it1->end(), it2->begin());
908  }
909 }
910 
911 
913 {
914  cudaDeviceReset();
915 }
916 
917 void genie::original::get_rowID_offset(vector<int> &result, vector<int> &resultID,
918  vector<int> &resultOffset, unsigned int shift_bits)
919 {
920  for(unsigned int i = 0 ; i < result.size() ; ++i)
921  {
922  int rowID, offset;
923  rowID = result[i]>>shift_bits;
924  offset = result[i] - (rowID<<shift_bits);
925  resultID.push_back(rowID);
926  resultOffset.push_back(offset);
927  }
928 }
929 
930 void genie::original::sequence_to_gram(vector<vector<int> > & sequences, vector<vector<int> >& gram_data,
931  int max_value, int gram_length)
932 {
933  int num_of_value = max_value + 1;
934  int gram_value = 0;
935 
936  for(unsigned int i = 0 ; i < sequences.size() ; ++i)
937  {
938  int max_index = sequences[i].size() - gram_length + 1;
939  if(max_index <= 0)
940  {
941  vector<int> line_null;
942  vector<int> temp_line;
943  for(unsigned int p = 0; p < (unsigned int)gram_length; ++p)
944  {
945  if(p < sequences[i].size() && sequences[i].size() <= 0) break;
946  if(p < sequences[i].size())
947  line_null.push_back(sequences[i][p]);
948  else
949  line_null.push_back(sequences[i][0]);
950 
951  }
952  int number = 0;
953  for(unsigned int p = 0; p < (unsigned int)gram_length; ++p)
954  {
955  number = number*num_of_value + line_null[p];
956  }
957  temp_line.push_back(number);
958  gram_data.push_back(temp_line);
959  continue;
960  }
961  vector<int> line;
962  for(unsigned int j = 0 ; j < (unsigned int)max_index ; ++j)
963  {
964  gram_value = 0;
965  for(unsigned int k = 0 ; k < (unsigned int)gram_length ; ++k )
966  {
967  gram_value = gram_value*num_of_value + sequences[i][j + k];
968  }
969  line.push_back(gram_value);
970  }
971  gram_data.push_back(line);
972  }
973 }
974 
975 void genie::original::sequence_reduce_to_ground(vector<vector<int> > & data, vector<vector<int> > & converted_data ,int& min_value ,int &max_value)
976 {
977  min_value = data[0][0];
978  max_value = min_value;
979  converted_data.clear();
980  for(unsigned int i = 0 ; i < data.size() ; ++i)
981  for(unsigned int j = 0 ; j < data[i].size() ; ++j)
982  {
983  if(data[i][j] > max_value) max_value = data[i][j];
984  if(data[i][j] < min_value) min_value = data[i][j];
985  }
986  for(unsigned int i = 0 ; i < data.size() ; ++i)
987  {
988  vector<int> line;
989  for(unsigned int j = 0 ; j < data[i].size() ; ++j)
990  line.push_back(data[i][j] - min_value);
991  converted_data.push_back(line);
992 
993  }
994 }
995 
997 {
998  genie::Config config = Config().SetGpuId(old_config.use_device);
999  genie::utility::Init(config);
1000 }
1001 
genie::compression::COMPRESSION_TYPE compression
Definition: interface.h:108
Config & SetGpuId(const uint8_t gpu_id)
Set the ID of the GPU used.
Definition: config.cc:62
std::vector< std::vector< int > > * query_points
Definition: interface.h:93
void invert_bijectMap(std::vector< std::vector< int > > &vin)
void load_query_singlerange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
Definition: interface.cc:473
void topk(int k)
Set top k matches.
Definition: query.cc:263
int get_gram_length_sequence()
Get the gram length.
Definition: inv_table.cu:385
void init_genie(GPUGenie_Config &config)
Definition: interface.cc:996
void load_table_bijectMap(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
Definition: interface.cc:621
bool cpy_data_to_gpu()
Copy vector _inv to gpu memory which is referenced by d_inv_p.
Definition: inv_table.cu:30
bool preprocess_for_knn_csv(GPUGenie_Config &config, genie::table::inv_table *&_table)
Definition: interface.cc:150
This is the top-level namespace of the project.
void set_gram_length_sequence(int gram_length)
Set length of each gram.
Definition: inv_table.cu:378
const COMPRESSION_TYPE DEFAULT_COMPRESSION_TYPE
Definition: interface.cc:48
int * d_inv_p
d_inv_p points to the start location for posting list array in GPU memory.
Definition: inv_table.h:55
void knn_search_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query >> &queries, std::vector< std::vector< int > > &h_topk, std::vector< std::vector< int > > &h_topk_count, std::vector< GPUGenie_Config > &config)
void setCompression(genie::compression::COMPRESSION_TYPE compression)
unsigned long long getTime()
Get system time.
Definition: Timing.cc:22
The declaration for class inv_table.
Definition: inv_table.h:41
void knn_search_after_preprocess(GPUGenie_Config &config, genie::table::inv_table *&_table, std::vector< int > &result, std::vector< int > &result_count)
Definition: interface.cc:282
void swap(int *position, int offset1, int offset2)
Definition: interface.cc:52
This struct is used to construct query in multirange mode.
Definition: FileReader.h:29
void invert_subsequence(std::vector< std::vector< int >> &vin)
void knn_search_for_csv_data(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
Definition: interface.cc:758
void Init(genie::Config &config)
Definition: init.cu:7
void load_query(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
Definition: interface.cc:403
int get_total_num_of_table() const
return the total_num_of_table.
Definition: inv_table.cu:225
void load_table(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
Definition: interface.cc:316
void apply_adaptive_query_range()
Construct query in adaptice range mode.
Definition: query.cc:196
void append(inv_list &inv)
Append an inv_list to the inv_table.
Definition: inv_table.cu:107
int get_min_value_sequence()
Get the min value for sequences&#39; elements in this inv_table.
Definition: inv_table.cu:359
void merge_knn_results_from_multiload(std::vector< std::vector< int > > &_result, std::vector< std::vector< int > > &_result_count, std::vector< int > &result, std::vector< int > &result_count, int table_num, int topk, int query_num, unsigned int iteration_times)
Definition: interface.cc:114
Definitions about configurations that can be set by users.
Definition: interface.h:71
virtual void build(size_t max_length, bool use_load_balance)
Build the inv_table.
Definition: inv_table.cu:268
void get_rowID_offset(std::vector< int > &result, std::vector< int > &resultID, std::vector< int > &resultOffset, unsigned int shift_bits)
int shift_bits_sequence
This variable is used to tell the number of bits shifted for recording gram in different position...
Definition: inv_table.h:75
void reset_device()
clear gpu memory
Definition: interface.cc:912
std::vector< genie::utility::attr_t > * multirange_query_points
Definition: interface.h:94
Collection of knn functions.
void load_query_sequence(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
void knn_bijectMap_MT(std::vector< genie::table::inv_table *> &table, std::vector< std::vector< genie::query::Query > > &queries, std::vector< thrust::device_vector< int > > &d_top_indexes, std::vector< thrust::device_vector< int > > &d_top_count, std::vector< int > &hash_table_size, std::vector< int > &max_load, int bitmap_bits)
int partition(int *id_array, int *count_array, const int left, const int right)
Definition: interface.cc:60
Record run-time information.
uint32_t u32
Definition: match_common.h:18
Config class holds all user configurable settings of GENIE.
Definition: config.h:20
void attr(int index, int low, int up, float weight, int order)
Modify the matching range and weight of an attribute.
Definition: query.cc:76
unsigned long long u64
A type definition for a 64-bit unsigned integer.
Definition: match_common.h:19
bool is_stored_in_gpu
is_stored_in_gpu tell whether inverted index structure is pre-stored inside gpu memory ...
Definition: inv_table.h:70
void load_query_multirange(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, GPUGenie_Config &config)
Definition: interface.cc:427
void knn_search(std::vector< int > &result, std::vector< int > &result_count, GPUGenie_Config &config)
Definition: interface.cc:779
void set_table_index(int attr_index)
Set the table_index to &#39;index&#39;.
Definition: inv_table.cu:208
#define GPUGENIE_DEFAULT_WEIGHT
Definition: interface.h:32
std::vector< std::vector< int > > * data_points
Definition: interface.h:81
double getInterval(unsigned long long start, unsigned long long stop)
Calculate time interval from start to end.
Definition: Timing.cc:36
void print1DVector(vector< int > &toPrint)
Definition: interface.cc:102
Functions about getting system time.
void set_min_value_sequence(int min_value)
Used in sequence search. To set the min_value for all sequences&#39; element.
Definition: inv_table.cu:353
void knn(genie::table::inv_table &table, std::vector< genie::query::Query > &queries, thrust::device_vector< int > &d_top_indexes, thrust::device_vector< int > &d_top_count, int hash_table_size, int max_load, int bitmap_bits)
void set_total_num_of_table(int num)
Set the total_num_of_table to &#39;num&#39;.
Definition: inv_table.cu:213
void load_table_sequence(genie::table::inv_table &table, std::vector< std::vector< int > > &data_points, GPUGenie_Config &config)
void sequence_to_gram(std::vector< std::vector< int > > &sequences, std::vector< std::vector< int > > &gram_data, int max_value, int gram_length)
void setUncompressedPostingListMaxLength(size_t length)
define class inv_compre_table
void quicksortIdCount(int *id_array, int *count_array, const int left, const int right)
Definition: interface.cc:91
bool use_load_balance
Whether this query set is to use load balance.
Definition: query.h:125
This class manages one inverted list.
Definition: inv_list.h:23
void invert(std::vector< int > &vin)
int get_max_value_sequence()
Get the max value.
Definition: inv_table.cu:371
void selectivity(float s)
Set the selectivity.
Definition: query.cc:131
void sequence_reduce_to_ground(std::vector< std::vector< int > > &data, std::vector< std::vector< int > > &converted_data, int &min_value, int &max_value)
void set_max_value_sequence(int max_value)
Set the max value for all sequence. Compare to set_min_value_sequence()
Definition: inv_table.cu:365
void invert_sequence(std::vector< std::vector< int >> &vin, int shift_bits)
This is used for sequence match.
Definition: inv_list.cc:87
void append_sequence(inv_list &inv)
append inv_list for sequence search
Definition: inv_table.cu:130