BitMagic-C++
xsample05.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example xsample05.cpp
20 */
21 
22 /*! \file xsample05.cpp
23  \brief Example: Example on how to use bit-transposed string sparse vector
24 
25  Illustrates how to build a sparse vector, serialize it to disk,
26  load back and do search or binary hybrid search.
27 
28  \sa bm::str_sparse_vector<>
29 
30 */
31 
32 #include <iostream>
33 #include <sstream>
34 #include <chrono>
35 #include <regex>
36 #include <time.h>
37 #include <stdio.h>
38 
39 
40 #include <vector>
41 #include <map>
42 #include <chrono>
43 #include <map>
44 #include <utility>
45 #include <algorithm>
46 #include <random>
47 
48 using namespace std;
49 
50 //#define BMAVX2OPT
51 
52 #include "bm.h"
53 #include "bmalgo.h"
54 #include "bmserial.h"
55 #include "bmrandom.h"
56 #include "bmstrsparsevec.h"
57 #include "bmsparsevec_algo.h"
58 #include "bmsparsevec_serial.h"
59 #include "bmalgo_similarity.h"
60 #include "bmsparsevec_util.h"
61 
62 
63 #include "bmdbg.h"
64 #include "bmtimer.h"
65 
66 /// Print help
67 static
68 void show_help()
69 {
70  std::cerr
71  << "BitMagic Dictionary Search Sample (c) 2018" << std::endl
72  << "-idict file-name -- input set file to parse" << std::endl
73  << "-svout spase vector output -- sparse vector name to save" << std::endl
74  << "-svin sparse vector input -- sparse vector file name to load " << std::endl
75  << "-diag -- run diagnostics" << std::endl
76  << "-bench -- run benchmarks" << std::endl
77  << "-timing -- collect timings" << std::endl
78  ;
79 }
80 
81 
82 // Arguments
83 //
84 std::string sv_out_name;
85 std::string sv_in_name;
86 std::string i_name;
87 bool is_diag = false;
88 bool is_timing = false;
89 bool is_bench = false;
90 
91 // Parse command line arguments
92 static
93 int parse_args(int argc, char *argv[])
94 {
95  for (int i = 1; i < argc; ++i)
96  {
97  std::string arg = argv[i];
98  if ((arg == "-h") || (arg == "--help"))
99  {
100  show_help();
101  return 0;
102  }
103 
104  if (arg == "-svout" || arg == "--svout")
105  {
106  if (i + 1 < argc)
107  {
108  sv_out_name = argv[++i];
109  }
110  else
111  {
112  std::cerr << "Error: -svout requires file name" << std::endl;
113  return 1;
114  }
115  continue;
116  }
117 
118  if (arg == "-svin" || arg == "--svin")
119  {
120  if (i + 1 < argc)
121  {
122  sv_in_name = argv[++i];
123  }
124  else
125  {
126  std::cerr << "Error: -svin requires file name" << std::endl;
127  return 1;
128  }
129  continue;
130  }
131 
132  if (arg == "-idict" || arg == "--idict" )
133  {
134  if (i + 1 < argc)
135  {
136  i_name = argv[++i];
137  }
138  else
139  {
140  std::cerr << "Error: -idict requires file name" << std::endl;
141  return 1;
142  }
143  continue;
144  }
145 
146  if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
147  {
148  is_diag = true;
149  continue;
150  }
151  if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
152  {
153  is_timing = true;
154  continue;
155  }
156  if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
157  {
158  is_bench = true;
159  continue;
160  }
161 
162  std::cerr << "Error: unknown argument: " << arg << std::endl;
163  return 1;
164 
165 
166  } // for i
167  return 0;
168 }
169 
170 
171 // Global types
172 //
174 typedef vector<string> string_vector;
175 
176 
177 
179 
180 /// Parse the input file and extract dictionary values.
181 ///
182 static
183 int load_dict_report(const std::string& fname, string_vector& str_vec)
184 {
185  bm::chrono_taker tt1("1. parse input data ", 1, &timing_map);
186 
187  std::ifstream fin(fname.c_str(), std::ios::in);
188  if (!fin.good())
189  return -1;
190 
191  std::string line;
192  std::regex reg("[|]");
193  std::sregex_token_iterator it_end;
194 
195  string trim_chars("\" ");
196 
197  for (unsigned i = 0; std::getline(fin, line); ++i)
198  {
199  if (line.empty() || !isdigit(line.front()))
200  continue;
201 
202  // regex based tokenizer
203  std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
204  std::vector<std::string> line_vec(it, it_end);
205  if (line_vec.empty())
206  continue;
207  try
208  {
209  // trip the extra chars
210  string& col13 = line_vec.at(13);
211  col13.erase(0, col13.find_first_not_of(trim_chars));
212  col13.erase(col13.find_last_not_of(trim_chars) + 1);
213 
214  if (!col13.empty())
215  str_vec.emplace_back(col13);
216  }
217  catch (std::exception&)
218  {
219  // just ignore (not ideal, but ok for a sketch ETL)
220  }
221  if (i % 10000 == 0)
222  {
223  cout << "\rReading input file: " << i << flush;
224  }
225  } // for
226  cout << endl;
227  return 0;
228 }
229 
230 /// Compare STL vector with bit-transposed container to check correcness
231 ///
232 static
233 void check_sparse(const str_sparse_vect& str_sv, const string_vector& str_vec)
234 {
235  if (str_vec.size() != str_sv.size())
236  throw runtime_error("Error. size() comparison failed!");
237  string s;
238  for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
239  {
240  str_sv.get(i, s);
241  const string& s_control = str_vec[i];
242  if (s != s_control)
243  throw runtime_error("Error. element comparison failed!");
244  } // for
245  std::cout << "Check ok. Dictionary size = " << str_sv.size() << std:: endl;
246 }
247 
248 const unsigned benchmark_max = 15000; // benchmark sampling size
249 
250 /// Sample a few random terms out of collection
251 static
252 void pick_benchmark_set(string_vector& bench_vec, string_vector& bench_vec_not_found, const string_vector& str_vec)
253 {
254  std::random_device rand_dev;
255  std::mt19937 gen(rand_dev()); // mersenne_twister_engine
256  std::uniform_int_distribution<unsigned> rand_dis(0, unsigned(str_vec.size()-1)); // generate uniform numebrs for [1, vector_max]
257 
258  bm::bvector<> bv;
259 
260  bench_vec.resize(0);
261  for (unsigned i = 0; i < benchmark_max; ++i)
262  {
263  unsigned idx;
264  while (true)
265  {
266  idx = unsigned(rand_dis(gen));
267  if (bv.test(idx)) // make sure benchmark example is not repeated
268  continue;
269  if (idx < str_vec.size())
270  bench_vec.push_back(str_vec[idx]);
271  break;
272  }
273  bv.set(idx); // mark as set
274 
275  // generate not-found case by modifying a letter in an existing sample
276  {
277  string str_nf = str_vec[idx];
278  string::reverse_iterator rit = str_nf.rbegin();
279  string::reverse_iterator rit_end = str_nf.rend();
280  for (; rit != rit_end; ++rit)
281  {
282  char ch = *rit;
283  int a = rand() % 26 + int('A'); // generate random letter
284  ch = char(a);
285  *rit = ch;
286  auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
287  if (it == str_vec.end() || *it != str_nf) // check if not found
288  {
289  bench_vec_not_found.push_back(str_nf);
290  break;
291  }
292  } // for rit
293  }
294 
295  } // for
296  cout << endl;
297 }
298 
299 static
300 void run_benchmark(const str_sparse_vect& str_sv, const string_vector& str_vec)
301 {
302  string_vector bench_vec;
303  string_vector bench_vec_not_found; // vector for impossible dictionary items
304 
305  pick_benchmark_set(bench_vec, bench_vec_not_found, str_vec);
306 
307  bm::bvector<> bv1, bv2, bv3, bv4;
308 
309  cout << "Picked " << bench_vec.size() << " / "
310  << bench_vec_not_found.size() << " samples. Running benchmarks."
311  << endl;
312 
313  unsigned bench_size = unsigned(bench_vec.size());
314  {
315  {
316  bm::chrono_taker tt1("3. std::lower_bound() search", bench_size, &timing_map);
317  for (const string& term : bench_vec)
318  {
319  auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
320  if (it != str_vec.end())
321  {
322  string_vector::size_type idx =
323  string_vector::size_type(std::distance(str_vec.begin(), it));
324  bv1.set(unsigned(idx));
325  }
326  } // for
327  }
328  {
329  bm::chrono_taker tt2("3a. std::lower_bound() search (empty)", bench_size, &timing_map);
330  for (const string& term : bench_vec_not_found)
331  {
332  std::lower_bound(str_vec.begin(), str_vec.end(), term);
333  } // for
334  }
335  }
336 
337  {
338  // construct std::map<> (RB-tree)
339  std::map<string, unsigned> str_map;
340  for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
341  {
342  const string& s = str_vec[i];
343  str_map[s] = unsigned(i);
344  } // for
345  {
346  bm::chrono_taker tt1("4. std::map<> search", bench_size, &timing_map);
347  for (const string& term : bench_vec)
348  {
349  auto it = str_map.find(term);
350  if (it != str_map.end())
351  {
352  bv2.set(unsigned(it->second));
353  }
354  } // for
355  }
356  {
357  bm::chrono_taker tt2("4a. std::map<> search (empty)", bench_size, &timing_map);
358  for (const string& term : bench_vec_not_found)
359  {
360  auto it = str_map.find(term);
361  if (it != str_map.end())
362  {
363  cerr << "empty search returned value..." << endl;
364  }
365  } // for
366  }
367  }
368 
369  {
371  {
372  bm::chrono_taker tt1("5. bm::sparse_vector_scanner<> search", bench_size, &timing_map);
373  for (const string& term : bench_vec)
374  {
375  unsigned pos;
376  bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
377  if (found)
378  {
379  bv3.set(pos);
380  }
381  } // for
382  }
383  {
384  bm::chrono_taker tt1("5a. bm::sparse_vector_scanner<> search (empty)", bench_size, &timing_map);
385  for (const string& term : bench_vec_not_found)
386  {
387  unsigned pos;
388  bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
389  if (found)
390  {
391  cerr << "scanner empty search returned value..." << endl;
392  }
393  } // for
394  }
395  }
396 
397  {
399  scanner.bind(str_sv, true); // attach SV as permanent search parameter to share cached values
400 
401  {
402  bm::chrono_taker tt1("6. bm::sparse_vector_scanner<> binary search", bench_size, &timing_map);
403  for (const string& term : bench_vec)
404  {
405  unsigned pos;
406  bool found = scanner.bfind_eq_str(term.c_str(), pos);
407  if (found)
408  {
409  bv4.set(pos);
410  }
411  } // for
412  }
413  {
414  bm::chrono_taker tt2("6a. bm::sparse_vector_scanner<> binary search (empty)", bench_size, &timing_map);
415  for (const string& term : bench_vec_not_found)
416  {
417  unsigned pos;
418  bool found = scanner.bfind_eq_str(term.c_str(), pos);
419  if (found)
420  {
421  cerr << "scanner empty search returned value..." << endl;
422  }
423  } // for
424  }
425  }
426 
427  // various integrity checks
428  //
429  int cmp = bv1.compare(bv2);
430  if (cmp != 0)
431  throw runtime_error("Error. RB-search mismatch!");
432  cmp = bv1.compare(bv3);
433  if (cmp != 0)
434  throw runtime_error("Error. scanner mismatch!");
435 
436  cmp = bv1.compare(bv4);
437  if (cmp != 0)
438  throw runtime_error("Error. binary scanner mismatch!");
439 
440  if (bv1.count() != bench_size)
441  throw runtime_error("Error. Search result missing elements!");
442 
443 
444 }
445 
446 
447 int main(int argc, char *argv[])
448 {
449  if (argc < 3)
450  {
451  show_help();
452  return 1;
453  }
454 
455  string_vector str_vec; // dictionary vector (STL)
456  str_sparse_vect str_sv; // bit-transposed sparse vector
457 
458  try
459  {
460  auto ret = parse_args(argc, argv);
461  if (ret != 0)
462  {
463  return ret;
464  }
465  if (!i_name.empty())
466  {
467  auto res = load_dict_report(i_name, str_vec);
468  if (res != 0)
469  {
470  return res;
471  }
472  cout << "Loaded " << str_vec.size() << " dictionary names." << endl;
473 
474  std::sort(str_vec.begin(), str_vec.end());
475  }
476 
477  if (str_vec.size()) // load the sparse vector
478  {
479  bm::chrono_taker tt1("2. build sparse vector", 1, &timing_map);
480 
481  for (const string& term : str_vec)
482  str_sv.push_back(term);
483 
484  // build remapped (dense) sparse vector
485  // (this should be final), no more edits in this form
486  {
487  str_sparse_vect str_sv_remap;
488  str_sv_remap.remap_from(str_sv);
489  str_sv.swap(str_sv_remap);
490  }
491 
493  str_sv.optimize(tb); // memory optimization after load
494  }
495 
496  // save SV vector to file
497  if (!sv_out_name.empty() && !str_sv.empty())
498  {
499  bm::chrono_taker tt1("7. Save sparse vector", 1, &timing_map);
500  file_save_svector(str_sv, sv_out_name);
501  }
502 
503  if (!sv_in_name.empty())
504  {
505  {
506  bm::chrono_taker tt1("8. Load sparse vector", 1, &timing_map);
507  file_load_svector(str_sv, sv_in_name);
508  }
509  if (str_vec.empty())
510  {
511  for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
512  {
513  string s;
514  str_sv.get(i, s);
515  str_vec.emplace_back(std::move(s));
516  } // for
517  }
518  }
519 
520 
521  if (is_diag)
522  {
523  if (!str_sv.empty())
524  {
525  print_svector_stat(str_sv, true);
526  }
527 
528  if (str_vec.size())
529  {
530  size_t total_size = 0;
531  for (const string& term : str_vec)
532  {
533  total_size += term.size();
534  }
535  cout << "String dictionary size: "
536  << total_size / 1024 << "KB (" << total_size / (1024*1024) << "MB)"
537  << endl;
538  }
539 
540  if (str_sv.size() && str_vec.size())
541  {
542  cout << "Run full comparison check..." << endl;
543  check_sparse(str_sv, str_vec); // run a paranoiya check
544  cout << "Ok" << endl;
545  }
546  }
547 
548  if (is_bench) // run set of benchmarks
549  {
550  run_benchmark(str_sv, str_vec);
551  }
552 
553  if (is_timing) // print all collected timings
554  {
555  std::cout << std::endl << "Performance:" << std::endl;
557  }
558  }
559  catch (std::exception& ex)
560  {
561  std::cerr << "Error:" << ex.what() << std::endl;
562  return 1;
563  }
564 
565  return 0;
566 }
567 
bmsparsevec_algo.h
Algorithms for bm::sparse_vector.
bmrandom.h
Generation of random subset.
bm::chrono_taker
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
bm::bvector::size
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store)
Definition: bm.h:1272
bm::str_sparse_vector::get
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
Definition: bmstrsparsevec.h:1295
bm::str_sparse_vector::push_back
void push_back(const StrType &str)
push back a string
Definition: bmstrsparsevec.h:532
BM_DECLARE_TEMP_BLOCK
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
sv_out_name
std::string sv_out_name
Definition: xsample05.cpp:84
bm::sparse_vector_scanner
algorithms for sparse_vector scan/search
Definition: bmsparsevec_algo.h:481
bm::bvector::compare
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3185
bm::sparse_vector_scanner::bfind_eq_str
bool bfind_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
binary find first sparse vector element (string) Sparse vector must be sorted.
Definition: bmsparsevec_algo.h:1561
bmsparsevec_serial.h
Serialization for sparse_vector<>
bmalgo.h
Algorithms for bvector<> (main include)
is_timing
bool is_timing
Definition: xsample05.cpp:88
show_help
static void show_help()
Print help.
Definition: xsample05.cpp:68
is_diag
bool is_diag
Definition: xsample05.cpp:87
bm::str_sparse_vector::empty
bool empty() const
return true if vector is empty
Definition: bmstrsparsevec.h:642
bm::bvector<>
bm::bvector::set
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:3607
bm::str_sparse_vector::size
size_type size() const
return size of the vector
Definition: bmstrsparsevec.h:637
bm::str_sparse_vector
sparse vector for strings with compression using bit transposition method
Definition: bmstrsparsevec.h:56
bm::bvector::count
size_type count() const BMNOEXCEPT
population cout (count of ON bits)
Definition: bm.h:2220
str_sparse_vect
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect
Definition: xsample05.cpp:173
bmstrsparsevec.h
string sparse vector based on bit-transposed matrix
bmsparsevec_util.h
gen
std::mt19937 gen(rand_dev())
benchmark_max
const unsigned benchmark_max
Definition: xsample05.cpp:248
timing_map
bm::chrono_taker::duration_map_type timing_map
Definition: xsample05.cpp:178
bm::chrono_taker::ct_time
@ ct_time
Definition: bmtimer.h:58
bmtimer.h
Timing utilities for benchmarking (internal)
sv_in_name
std::string sv_in_name
Definition: xsample05.cpp:85
bm::str_sparse_vector::size_type
bvector_type::size_type size_type
Definition: bmstrsparsevec.h:63
bm::str_sparse_vector::swap
void swap(str_sparse_vector &str_sv) BMNOEXCEPT
Definition: bmstrsparsevec.h:1146
parse_args
static int parse_args(int argc, char *argv[])
Definition: xsample05.cpp:93
is_bench
bool is_bench
Definition: xsample05.cpp:89
bm::sparse_vector_scanner::find_eq_str
bool find_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
find first sparse vector element (string)
Definition: bmsparsevec_algo.h:1515
bm::bvector::resize
void resize(size_type new_size)
Change size of the bvector.
Definition: bm.h:2282
bm::bvector::test
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition: bm.h:1454
bm::chrono_taker::duration_map_type
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
bmserial.h
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
bm::chrono_taker::print_duration_map
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
bmalgo_similarity.h
pick_benchmark_set
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
Definition: xsample05.cpp:252
check_sparse
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correcness.
Definition: xsample05.cpp:233
i_name
std::string i_name
Definition: xsample05.cpp:86
string_vector
vector< string > string_vector
Definition: xsample05.cpp:174
bm::sparse_vector_scanner::bind
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
Definition: bmsparsevec_algo.h:1118
main
int main(int argc, char *argv[])
Definition: xsample05.cpp:447
bm::str_sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *stat=0)
run memory optimization for all vector plains
Definition: bmstrsparsevec.h:1323
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
run_benchmark
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
Definition: xsample05.cpp:300
load_dict_report
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
Definition: xsample05.cpp:183
rand_dis
std::uniform_int_distribution rand_dis(1, int(vector_max))
bm::str_sparse_vector::remap_from
void remap_from(const str_sparse_vector &str_sv)
Build remapping profile and load content from another sparse vector.
Definition: bmstrsparsevec.h:1600
rand_dev
std::random_device rand_dev
Definition: sample11.cpp:49