GENIE
query.cc
Go to the documentation of this file.
1 
5 #include <algorithm>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <stdexcept>
9 #include <cmath>
10 #include <unordered_map>
11 #include <iostream>
12 
14 #include <genie/utility/Logger.h>
15 
16 using namespace std;
17 using namespace genie::table;
18 using namespace genie::utility;
19 
20 #include "query.h"
21 
22 typedef unsigned int u32;
23 typedef unsigned long long u64;
24 
26 {
27  _ref_table = NULL;
28  _attr_map.clear();
29  _dim_map.clear();
30  _topk = 1;
31  _selectivity = -1.0f;
32  _index = -1;
33  _count = -1;
34  is_load_balanced = false;
35  use_load_balance = false;
36 }
37 
39 {
40  _ref_table = ref;
41  _attr_map.clear();
42  _dim_map.clear();
43  _topk = 1;
44  _selectivity = -1.0f;
45  _index = index;
46  _count = -1;
47  is_load_balanced = false;
48  use_load_balance = false;
49 }
50 
52 {
53  _ref_table = &ref;
54  _attr_map.clear();
55  _dim_map.clear();
56  _topk = 1;
57  _selectivity = -1.0f;
58  _index = index;
59  _count = 0;
60  is_load_balanced = false;
61  use_load_balance = false;
62 }
63 
64 genie::query::Query::dim::dim(int query, int order, int start_pos, int end_pos, float weight)
65  : query(query),
66  order(order),
67  start_pos(start_pos),
68  end_pos(end_pos),
69  weight(weight) {}
70 
72 {
73  return _ref_table;
74 }
75 
76 void genie::query::Query::attr(int index, int low, int up, float weight, int order)
77 {
78  attr(index, low, up, weight, -1, order);
79 }
80 
81 void genie::query::Query::attr(int index, int value, float weight,
82  float selectivity, int order)
83 {
84  attr(index, value, value, weight, selectivity, order);
85 }
86 
87 void genie::query::Query::attr(int index, int low, int up, float weight,
88  float selectivity, int order)
89 {
90  if (index < 0 || index >= _ref_table->m_size())
91  return;
92 
93  range new_attr;
94  new_attr.low = low;
95  new_attr.up = up;
96  new_attr.weight = weight;
97  new_attr.dim = index;
98  new_attr.query = _index;
99  new_attr.order = order;
100  new_attr.low_offset = 0;
101  new_attr.up_offset = 0;
102  new_attr.selectivity = selectivity;
103 
104  if (_attr_map.find(index) == _attr_map.end())
105  {
106  //std::vector<range>* new_range_list = new std::vector<range>;
107  //_attr_map[index] = new_range_list;
108  _attr_map[index] = vector<range>();
109  }
110 
111  _attr_map[index].push_back(new_attr);
112  _count++;
113 }
114 
115 
117 {
118  if (_attr_map.find(index) == _attr_map.end())
119  {
120  return;
121  }
122  //_count -= _attr_map[index]->size();
123  //_attr_map[index]->clear();
124  //free(_attr_map[index]);
125  _count -= _attr_map[index].size();
126  _attr_map[index].clear();
127  _attr_map[index].shrink_to_fit();
128  _attr_map.erase(index);
129 }
130 
132 {
133  _selectivity = s;
134 }
135 
137 {
138  return _selectivity;
139 }
140 
142 {
143  this->build();
144 
145  if(max_load <= 0)
146  {
147  Logger::log(Logger::ALERT, "Please set a valid max_load.");
148  return;
149  }
150 
151  _dims.clear();
152 
153  for(auto di = _dim_map.begin() ; di != _dim_map.end(); ++di)
154  {
155  std::vector<dim>& dims = di->second;
156  for(unsigned int i = 0; i < dims.size(); ++i)
157  {
158  unsigned int length = dims[i].end_pos - dims[i].start_pos;
159  if((unsigned int)max_load > length)
160  {
161  _dims.push_back(dims[i]);
162  continue;
163  }
164  unsigned int j = 1;
165  for(; max_load*j <= length; ++j)
166  {
167  dim new_dim;
168  new_dim.query = dims[i].query;
169  new_dim.order = dims[i].order;
170  new_dim.weight = dims[i].weight;
171  new_dim.start_pos = max_load*(j-1) + dims[i].start_pos;
172  new_dim.end_pos = new_dim.start_pos + max_load;
173  _dims.push_back(new_dim);
174  }
175  if(max_load*(j-1) != length)
176  {
177  dim new_dim;
178  new_dim.query = dims[i].query;
179  new_dim.order = dims[i].order;
180  new_dim.weight = dims[i].weight;
181  new_dim.start_pos = max_load*(j-1) + dims[i].start_pos;
182  new_dim.end_pos = dims[i].end_pos;
183  _dims.push_back(new_dim);
184  }
185 
186 
187  }
188  }
189 
190  this->is_load_balanced = true;
191 }
192 
193 
194 
195 
197 {
198  inv_table& table = *_ref_table;
199 
200  if (table.build_status() == inv_table::not_builded)
201  {
202  Logger::log(Logger::ALERT,
203  "Please build the inverted table before applying adaptive query range.");
204  return;
205  }
206 
207  u32 global_threshold = _selectivity > 0 ? u32(ceil(_selectivity * table.i_size())) : 0;
208  u32 local_threshold;
209 
210  for (auto di = _attr_map.begin(); di != _attr_map.end(); ++di)
211  {
212  std::vector<range>* ranges = &(di->second);
213  int index = di->first;
214 
215  for (unsigned int i = 0; i < ranges->size(); ++i)
216  {
217  range& d = ranges->at(i);
218  if (d.selectivity > 0)
219  {
220  local_threshold = ceil(d.selectivity * table.i_size());
221  }
222  else if (_selectivity > 0)
223  {
224  local_threshold = global_threshold;
225  }
226  else
227  {
228  Logger::log(Logger::ALERT, "Please set valid selectivity!");
229  return;
230  }
231 
232  unsigned int count = 0;
233  for (int vi = d.low; vi <= d.up; ++vi)
234  {
235  if(!table.list_contain(index, vi))
236  continue;
237  count += table.get_posting_list_size(index, vi);
238  }
239  while (count < local_threshold)
240  {
241  if (d.low > 0)
242  {
243  d.low--;
244  if(!table.list_contain(index, d.low))
245  continue;
246  count += table.get_posting_list_size(index, d.low);
247  }
248  if(!table.list_contain(index, d.up+1))
249  {
250  if (d.low == 0)
251  break;
252  }
253  else
254  {
255  d.up++;
256  count += table.get_posting_list_size(index, d.up);
257  }
258  }
259  }
260  }
261 }
262 
264 {
265  _topk = k;
266 }
267 
269 {
270  return _topk;
271 }
272 
274 {
275  int low, up;
276  float weight;
277  inv_table& table = *_ref_table;
278  unordered_map<size_t, int>& inv_index_map = *table.inv_index_map();
279  vector<int>& inv_pos = *table.inv_pos();
280 
281  for (auto di = _attr_map.begin(); di != _attr_map.end(); ++di)
282  {
283  int index = di->first;
284  std::vector<range>& ranges = di->second;
285  int d = index << _ref_table->shifter();
286 
287  if (ranges.empty())
288  {
289  continue;
290  }
291 
292  if (_dim_map.find(index) == _dim_map.end())
293  {
294  //std::vector<dim>* new_list = new std::vector<dim>;
295  //_dim_map[index] = new_list;
296  _dim_map[index] = vector<dim>();
297  }
298 
299  for (unsigned int i = 0; i < ranges.size(); ++i)
300  {
301 
302  range& ran = ranges[i];
303  low = ran.low;
304  up = ran.up;
305  weight = ran.weight;
306 
307  if(low > up || low > table.get_upperbound_of_list(index) || up < table.get_lowerbound_of_list(index))
308  {
309  continue;
310  }
311 
312  dim new_dim;
313  new_dim.weight = weight;
314  new_dim.query = _index;
315  new_dim.order = ran.order;
316 
317  low = low < table.get_lowerbound_of_list(index)?table.get_lowerbound_of_list(index):low;
318 
319  up = up > table.get_upperbound_of_list(index)?table.get_upperbound_of_list(index):up;
320 
321 
322  int _min, _max;
323 
324  _min = d + low - table.get_lowerbound_of_list(index);
325  _max = d + up - table.get_lowerbound_of_list(index);
326 
327  new_dim.start_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_min))->second];
328  new_dim.end_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_max+1))->second];
329 
330  _dim_map[index].push_back(new_dim);
331  }
332  }
333 }
334 
335 void genie::query::Query::build(vector<dim> &dims)
336 {
337  int low, up, order, start_pos, end_pos;
338  float weight;
339  unordered_map<size_t, int> &inv_index_map = *_ref_table->inv_index_map();
340  vector<int> &inv_pos = *_ref_table->inv_pos();
341 
342  for (auto di = _attr_map.begin(); di != _attr_map.end(); ++di)
343  {
344  int index = di->first;
345  vector<range> &ranges = di->second;
346  int d = index << _ref_table->shifter();
347 
348  if (ranges.empty())
349  continue;
350 
351  for (size_t i = 0; i < ranges.size(); ++i)
352  {
353  range &ran = ranges[i];
354  low = ran.low;
355  up = ran.up;
356  weight = ran.weight;
357  order = ran.order;
358 
359  if (low > up || low > _ref_table->get_upperbound_of_list(index) || up < _ref_table->get_lowerbound_of_list(index))
360  continue;
361 
362  low = low < _ref_table->get_lowerbound_of_list(index) ? _ref_table->get_lowerbound_of_list(index) : low;
363  up = up > _ref_table->get_upperbound_of_list(index) ? _ref_table->get_upperbound_of_list(index) : up;
364 
365  int _min, _max;
366 
367  _min = d + low - _ref_table->get_lowerbound_of_list(index);
368  _max = d + up - _ref_table->get_lowerbound_of_list(index);
369 
370  start_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_min))->second];
371  end_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_max+1))->second];
372 
373  dims.emplace_back(_index, order, start_pos, end_pos, weight);
374  }
375  }
376 }
377 
379 {
380  int low, up;
381  float weight;
382  inv_table& table = *_ref_table;
383  unordered_map<size_t, int>& inv_index_map = *table.inv_index_map();
384  vector<int>& inv_pos = *table.inv_pos();
385 
386  //unordered_map<int, int> _distinct;
387 
388  for (auto di = _attr_map.begin(); di != _attr_map.end(); ++di)
389  {
390  //_distinct.clear();
391  int index = di->first;
392  unordered_map<int, int>& _distinct = *table.get_distinct_map(index);
393  if(_distinct.size() <= 0) continue;
394 
395  unordered_map<int, int>::iterator itt = _distinct.begin();
396 
397  std::vector<range>& ranges = di->second;
398  int d = index << _ref_table->shifter();
399 
400 
401  if (ranges.empty())
402  {
403  continue;
404  }
405 
406  if (_dim_map.find(index) == _dim_map.end())
407  {
408  //std::vector<dim>* new_list = new std::vector<dim>;
409  //_dim_map[index] = new_list;
410  _dim_map[index] = vector<dim>();
411  }
412 
413  for (unsigned int i = 0; i < ranges.size(); ++i)
414  {
415 
416  range& ran = ranges[i];
417  low = ran.low;
418  up = ran.up;
419  weight = ran.weight;
420  //Here low must equal up
421  //vector<int>::iterator it = std::find(distinct_value.begin(), distinct_value.end(), low);
422  unordered_map<int, int>::iterator it = _distinct.find(low);
423  if(it != _distinct.end())
424  {
425  low = it->second;
426  up = low;
427  }
428  else
429  continue;
430 
431 
432  if(low > up || low > table.get_upperbound_of_list(index) || up < table.get_lowerbound_of_list(index))
433  {
434  continue;
435  }
436 
437  dim new_dim;
438  new_dim.weight = weight;
439  new_dim.query = _index;
440 
441  if(low < table.get_lowerbound_of_list(index))
442  continue;
443 
444  if(up > table.get_upperbound_of_list(index))
445  continue;
446 
447  int _min, _max;
448 
449  _min = d + low - table.get_lowerbound_of_list(index);
450  _max = d + up - table.get_lowerbound_of_list(index);
451 
452  new_dim.start_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_min))->second];
453  new_dim.end_pos = inv_pos[inv_index_map.find(static_cast<size_t>(_max+1))->second];
454 
455  _dim_map[index].push_back(new_dim);
456  }
457 
458 
459  }
460 }
461 
462 
463 #ifdef GENIE_COMPR
464 void genie::query::Query::build_compressed(int max_load)
465 {
466  inv_compr_table& table = *dynamic_cast<inv_compr_table*>(_ref_table);
467  unordered_map<size_t, int>& inv_index_map = *table.inv_index_map();
468  vector<int>& inv_pos = *table.inv_pos();
469 
470  for (std::map<int, std::vector<range>>::iterator di = _attr_map.begin();
471  di != _attr_map.end(); ++di)
472  {
473  int index = di->first;
474  std::vector<range> &ranges = di->second;
475  int d = index << _ref_table->shifter();
476 
477  if (ranges.empty())
478  continue;
479 
480  if (_dim_map.find(index) == _dim_map.end())
481  _dim_map[index] = std::vector<dim>();
482 
483  for (unsigned int i = 0; i < ranges.size(); ++i)
484  {
485 
486  range& ran = ranges[i];
487  int low = ran.low;
488  int up = ran.up;
489 
490  if(low > up || low > table.get_upperbound_of_list(index) || up < table.get_lowerbound_of_list(index))
491  continue;
492 
493  low = std::max(low, table.get_lowerbound_of_list(index));
494  up = std::min(up, table.get_upperbound_of_list(index));
495 
496  int _min, _max;
497  _min = d + low - table.get_lowerbound_of_list(index);
498  _max = d + up - table.get_lowerbound_of_list(index);
499 
500  // Index to inv_pos array of the first valid inverted list
501  int beginInvListIndex = inv_index_map.find(static_cast<size_t>(_min))->second;
502  // Index to inv_pos array after the last valid inverted list
503  int endInvListIndex = inv_index_map.find(static_cast<size_t>(_max+1))->second;
504 
505  assert (endInvListIndex > beginInvListIndex);
506 
507  if (max_load <= 0){
508  // Check that we are definitely not using multirange, i.e. check there is only one inv list in the
509  // compiled query, in case max_load is not specified
510  assert(beginInvListIndex + 1 == endInvListIndex);
511  } else {
512  // Check that all the inverted lists in the compiled query will be smaller than max load
513  for (int ipos = beginInvListIndex; ipos < endInvListIndex; ipos++){
514  assert(inv_pos[ipos+1] - inv_pos[ipos+1] <= max_load);
515  }
516  }
517 
518  // Generate compiled queries
519  std::vector<dim> &compiledQueriesVec = _dim_map[index];
520  for (int ipos = beginInvListIndex; ipos < endInvListIndex; ipos++){
521  dim new_dim;
522  new_dim.weight = ran.weight;
523  new_dim.query = _index;
524  new_dim.order = ran.order;
525  new_dim.start_pos = inv_pos[ipos];
526  new_dim.end_pos = inv_pos[ipos+1];
527 
528  compiledQueriesVec.push_back(new_dim);
529  }
530  }
531 
532  }
533 }
534 #endif
535 
536 int genie::query::Query::dump(vector<dim>& vout)
537 {
538  if (is_load_balanced)
539  {
540  for(unsigned int i = 0; i < _dims.size(); ++i)
541  {
542  vout.push_back(_dims[i]);
543  }
544  return _dims.size();
545  }
546  int count = 0;
547  for (std::map<int, std::vector<dim>>::iterator di = _dim_map.begin();
548  di != _dim_map.end(); ++di)
549  {
550  std::vector<dim> &ranges = di->second;
551  count += ranges.size();
552 
553  //vector<int>& _inv_ = *_ref_table->inv();
554  for (unsigned int i = 0; i < ranges.size(); ++i)
555  {
556  vout.push_back(ranges[i]);
557  }
558  }
559  return count;
560 }
561 
562 
563 int genie::query::Query::dump(vector<range>& ranges)
564 {
565  int count = 0;
566  ranges.clear();
567  for (std::map<int, std::vector<range>>::iterator di = _attr_map.begin();
568  di != _attr_map.end(); ++di)
569  {
570  std::vector<range> &rangesInAttr = di->second;
571  count += rangesInAttr.size();
572 
573  for (range &r : rangesInAttr)
574  ranges.push_back(r);
575  }
576  return count;
577 }
578 
580 {
581  return _index;
582 }
583 
585 {
586  return _count;
587 }
588 
590 {
591  Logger::log(Logger::DEBUG, "This query has %d dimensions.",
592  _attr_map.size());
593  int count = limit;
594  for (auto di = _attr_map.begin(); di != _attr_map.end(); ++di)
595  {
596  std::vector<range>& ranges = di->second;
597  for (unsigned int i = 0; i < ranges.size(); ++i)
598  {
599  if (count == 0)
600  return;
601  if (count > 0)
602  count--;
603  range& d = ranges[i];
604  Logger::log(Logger::DEBUG,
605  "dim %d from query %d: low %d(offset %d), up %d(offset %d), weight %.2f, selectivity %.2f.",
606  d.dim, d.query, d.low, d.low_offset, d.up, d.up_offset,
607  d.weight, d.selectivity);
608  }
609  }
610 }
611 
612 
void build_and_apply_load_balance(int max_load)
The function would construct query in the setting where load balance feature is on.
Definition: query.cc:141
std::unordered_map< size_t, int > * inv_index_map()
Definition: inv_table.cu:255
virtual std::vector< int > * inv_pos()
Definition: inv_table.cu:261
int get_upperbound_of_list(int attr_index)
Definition: inv_table.cu:184
The declaration for class inv_table.
Definition: inv_table.h:41
void build_sequence()
build query dim for sequence search
Definition: query.cc:378
unsigned int u32
Definition: query.cc:22
int topk()
Get top k matches.
Definition: query.cc:268
void print(int limit)
Print out the information of all dims.
Definition: query.cc:589
Declaration of query class.
genie::table::inv_table * ref_table()
The refrenced table&#39;s pointer.
Definition: query.cc:71
int dump(std::vector< dim > &vout)
virtual std::vector< int > * inv_pos()
std::unordered_map< int, int > * get_distinct_map(int dim)
Definition: inv_table.cu:102
std::vector< dim > _dims
Collection of dims of all queries.
Definition: query.h:130
void apply_adaptive_query_range()
Construct query in adaptice range mode.
Definition: query.cc:196
Query class includes the functions for processing queries based on user&#39;s input.
void clear()
Clear the inv_table.
Definition: inv_table.cu:47
int get_lowerbound_of_list(int attr_index)
Definition: inv_table.cu:193
unsigned long long u64
Definition: query.cc:23
int count_ranges()
Get value of _ count.
Definition: query.cc:584
The first-step struct for processing queries.
Definition: query.h:43
float selectivity()
The selectivity of the current settings.
Definition: query.cc:136
int get_posting_list_size(int attr_index, int value)
Definition: inv_table.cu:164
void clear_dim(int index)
Delete the dim at index.
Definition: query.cc:116
Record run-time information.
void attr(int index, int low, int up, float weight, int order)
Modify the matching range and weight of an attribute.
Definition: query.cc:76
The second-step struct for processing queries.
Definition: query.h:59
int index()
Get index of the query.
Definition: query.cc:579
bool is_load_balanced
Whether this query set has been applied load balance feature.
Definition: query.h:121
unsigned int u32
Definition: query.h:24
define class inv_compre_table
bool list_contain(int attr_index, int value)
Test whether a value is possible for an specific attribute.
Definition: inv_table.cu:173
void build()
Construct the query based on a normal table.
Definition: query.cc:273