BitMagic-C++
xsample05.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample05.cpp
20*/
21
22/*! \file xsample05.cpp
23 \brief Example: Example on how to use bit-transposed string sparse vector
24
25 Illustrates how to build a sparse vector, serialize it to disk,
26 load back and do search or binary hybrid search.
27
28 \sa bm::str_sparse_vector<>
29
30*/
31
32#include <iostream>
33#include <sstream>
34#include <chrono>
35#include <regex>
36#include <time.h>
37#include <stdio.h>
38
39
40#include <vector>
41#include <map>
42#include <chrono>
43#include <map>
44#include <utility>
45#include <algorithm>
46#include <random>
47
48using namespace std;
49
50//#define BMAVX2OPT
51
52#include "bm.h"
53#include "bmalgo.h"
54#include "bmserial.h"
55#include "bmrandom.h"
56#include "bmstrsparsevec.h"
57#include "bmsparsevec_algo.h"
58#include "bmsparsevec_serial.h"
59#include "bmalgo_similarity.h"
60#include "bmsparsevec_util.h"
61
62
63#include "bmdbg.h"
64#include "bmtimer.h"
65#include "bmundef.h" /* clear the pre-proc defines from BM */
66
67/// Print help
68static
70{
71 std::cerr
72 << "BitMagic Dictionary Search Sample (c) 2018" << std::endl
73 << "-idict file-name -- input set file to parse" << std::endl
74 << "-svout spase vector output -- sparse vector name to save" << std::endl
75 << "-svin sparse vector input -- sparse vector file name to load " << std::endl
76 << "-remap -- re-mapping of string characters " << std::endl
77 << "-xor -- use XOR compression filtering" << std::endl
78 << "-diag -- run diagnostics" << std::endl
79 << "-bench -- run benchmarks" << std::endl
80 << "-timing -- collect timings" << std::endl
81 ;
82}
83
84
85// Arguments
86//
87std::string sv_out_name;
88std::string sv_in_name;
89std::string i_name;
90bool is_diag = false;
91bool is_timing = false;
92bool is_bench = false;
93bool is_remap = false;
94bool is_xor = false;
95
96// Parse command line arguments
97static
98int parse_args(int argc, char *argv[])
99{
100 for (int i = 1; i < argc; ++i)
101 {
102 std::string arg = argv[i];
103 if ((arg == "-h") || (arg == "--help"))
104 {
105 show_help();
106 return 0;
107 }
108
109 if (arg == "-svout" || arg == "--svout")
110 {
111 if (i + 1 < argc)
112 {
113 sv_out_name = argv[++i];
114 }
115 else
116 {
117 std::cerr << "Error: -svout requires file name" << std::endl;
118 return 1;
119 }
120 continue;
121 }
122
123 if (arg == "-svin" || arg == "--svin")
124 {
125 if (i + 1 < argc)
126 {
127 sv_in_name = argv[++i];
128 }
129 else
130 {
131 std::cerr << "Error: -svin requires file name" << std::endl;
132 return 1;
133 }
134 continue;
135 }
136
137 if (arg == "-idict" || arg == "--idict" )
138 {
139 if (i + 1 < argc)
140 {
141 i_name = argv[++i];
142 }
143 else
144 {
145 std::cerr << "Error: -idict requires file name" << std::endl;
146 return 1;
147 }
148 continue;
149 }
150
151 if (arg == "-remap" || arg == "--remap" || arg == "-r" || arg == "--r")
152 {
153 is_remap = true;
154 continue;
155 }
156 if (arg == "-xor" || arg == "--xor" || arg == "-x" || arg == "--x")
157 {
158 is_xor = true;
159 continue;
160 }
161 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
162 {
163 is_diag = true;
164 continue;
165 }
166 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
167 {
168 is_timing = true;
169 continue;
170 }
171 if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
172 {
173 is_bench = true;
174 continue;
175 }
176
177 std::cerr << "Error: unknown argument: " << arg << std::endl;
178 return 1;
179
180
181 } // for i
182 return 0;
183}
184
185
186// Global types
187//
189typedef vector<string> string_vector;
190
191
192
194
195/// Parse the input file and extract dictionary values.
196///
197static
198int load_dict_report(const std::string& fname, string_vector& str_vec)
199{
200 bm::chrono_taker tt1(cout, "1. parse input data ", 1, &timing_map);
201
202 std::ifstream fin(fname.c_str(), std::ios::in);
203 if (!fin.good())
204 return -1;
205
206 std::string line;
207 std::regex reg("[|]");
208 std::sregex_token_iterator it_end;
209
210 string trim_chars("\" ");
211
212 for (unsigned i = 0; std::getline(fin, line); ++i)
213 {
214 if (line.empty() || !isdigit(line.front()))
215 continue;
216
217 // regex based tokenizer
218 std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
219 std::vector<std::string> line_vec(it, it_end);
220 if (line_vec.empty())
221 continue;
222 try
223 {
224 // trip the extra chars
225 string& col13 = line_vec.at(13);
226 col13.erase(0, col13.find_first_not_of(trim_chars));
227 col13.erase(col13.find_last_not_of(trim_chars) + 1);
228
229 if (!col13.empty())
230 str_vec.emplace_back(col13);
231 }
232 catch (std::exception&)
233 {
234 // just ignore (not ideal, but ok for a sketch ETL)
235 }
236 if (i % 10000 == 0)
237 {
238 cout << "\rReading input file: " << i << flush;
239 }
240 } // for
241 cout << endl;
242 return 0;
243}
244
245/// Compare STL vector with bit-transposed container to check correctness
246///
247static
248void check_sparse(const str_sparse_vect& str_sv, const string_vector& str_vec)
249{
250 if (str_vec.size() != str_sv.size())
251 throw runtime_error("Error. size() comparison failed!");
252 string s;
253 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
254 {
255 str_sv.get(i, s);
256 const string& s_control = str_vec[i];
257 if (s != s_control)
258 {
259 cout << "idx=" << i << s << "!=" << s_control << endl;
260 throw runtime_error("Error. element comparison failed!");
261 }
262 } // for
263 std::cout << "Check ok. Dictionary size = " << str_sv.size() << std:: endl;
264}
265
266const unsigned benchmark_max = 50000; // benchmark sampling size
267
268/// Sample a few random terms out of collection
269static
270void pick_benchmark_set(string_vector& bench_vec, string_vector& bench_vec_not_found, const string_vector& str_vec)
271{
272 std::random_device rand_dev;
273 std::mt19937 gen(rand_dev()); // mersenne_twister_engine
274 std::uniform_int_distribution<unsigned> rand_dis(0, unsigned(str_vec.size()-1)); // generate uniform numebrs for [1, vector_max]
275
276 bm::bvector<> bv;
277
278 bench_vec.resize(0);
279 for (unsigned i = 0; i < benchmark_max; ++i)
280 {
281 unsigned idx;
282 while (true)
283 {
284 idx = unsigned(rand_dis(gen));
285 if (bv.test(idx)) // make sure benchmark example is not repeated
286 continue;
287 if (idx < str_vec.size())
288 bench_vec.push_back(str_vec[idx]);
289 break;
290 }
291 bv.set(idx); // mark as set
292
293 // generate not-found case by modifying a letter in an existing sample
294 {
295 string str_nf = str_vec[idx];
296 string::reverse_iterator rit = str_nf.rbegin();
297 string::reverse_iterator rit_end = str_nf.rend();
298 for (; rit != rit_end; ++rit)
299 {
300 char ch = *rit;
301 int a = rand() % 26 + int('A'); // generate random letter
302 ch = char(a);
303 *rit = ch;
304 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
305 if (it == str_vec.end() || *it != str_nf) // check if not found
306 {
307 bench_vec_not_found.push_back(str_nf);
308 break;
309 }
310 } // for rit
311 }
312
313 } // for
314 cout << endl;
315}
316
317static
318void run_benchmark(const str_sparse_vect& str_sv, const string_vector& str_vec)
319{
320 string_vector bench_vec;
321 string_vector bench_vec_not_found; // vector for impossible dictionary items
322
323 pick_benchmark_set(bench_vec, bench_vec_not_found, str_vec);
324
325 bm::bvector<> bv1, bv2, bv3, bv4;
326
327 cout << "Picked " << bench_vec.size() << " / "
328 << bench_vec_not_found.size() << " samples. Running benchmarks."
329 << endl;
330
331 unsigned bench_size = unsigned(bench_vec.size());
332 {
333 {
334 bm::chrono_taker tt1(cout, "3. std::lower_bound() search", bench_size, &timing_map);
335 for (const string& term : bench_vec)
336 {
337 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
338 if (it != str_vec.end())
339 {
340 string_vector::size_type idx =
341 string_vector::size_type(std::distance(str_vec.begin(), it));
342 bv1.set(unsigned(idx));
343 }
344 } // for
345 }
346 {
347 bm::chrono_taker tt2(cout, "3a. std::lower_bound() search (empty)", bench_size, &timing_map);
348 for (const string& term : bench_vec_not_found)
349 {
350 auto p = std::lower_bound(str_vec.begin(), str_vec.end(), term);
351 } // for
352 }
353 }
354
355 {
356 // construct std::map<> (RB-tree)
357 std::map<string, unsigned> str_map;
358 for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
359 {
360 const string& s = str_vec[i];
361 str_map[s] = unsigned(i);
362 } // for
363 {
364 bm::chrono_taker tt1(cout, "4. std::map<> search", bench_size, &timing_map);
365 for (const string& term : bench_vec)
366 {
367 auto it = str_map.find(term);
368 if (it != str_map.end())
369 {
370 bv2.set(unsigned(it->second));
371 }
372 } // for
373 }
374 {
375 bm::chrono_taker tt2(cout, "4a. std::map<> search (empty)", bench_size, &timing_map);
376 for (const string& term : bench_vec_not_found)
377 {
378 auto it = str_map.find(term);
379 if (it != str_map.end())
380 {
381 cerr << "empty search returned value..." << endl;
382 }
383 } // for
384 }
385 }
386
387 {
389 {
390 bm::chrono_taker tt1(cout, "5. bm::sparse_vector_scanner<> search", bench_size, &timing_map);
391 for (const string& term : bench_vec)
392 {
393 unsigned pos;
394 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
395 if (found)
396 {
397 bv3.set(pos);
398 }
399 } // for
400 }
401 {
402 bm::chrono_taker tt1(cout, "5a. bm::sparse_vector_scanner<> search (empty)", bench_size, &timing_map);
403 for (const string& term : bench_vec_not_found)
404 {
405 unsigned pos;
406 bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
407 if (found)
408 {
409 cerr << "scanner empty search returned value..." << endl;
410 }
411 } // for
412 }
413 }
414
415 {
417 scanner.bind(str_sv, true); // attach SV as permanent search parameter to share cached values
418
419 {
420 bm::chrono_taker tt1(cout, "6. bm::sparse_vector_scanner<> binary search", bench_size, &timing_map);
421 for (const string& term : bench_vec)
422 {
423 unsigned pos;
424 bool found = scanner.bfind_eq_str(term.c_str(), pos);
425 if (found)
426 {
427 bv4.set(pos);
428 }
429 } // for
430 }
431 {
432 bm::chrono_taker tt2(cout, "6a. bm::sparse_vector_scanner<> binary search (empty)", bench_size, &timing_map);
433 for (const string& term : bench_vec_not_found)
434 {
435 unsigned pos;
436 bool found = scanner.bfind_eq_str(term.c_str(), pos);
437 if (found)
438 {
439 cerr << "scanner empty search returned value..." << endl;
440 }
441 } // for
442 }
443 }
444
445 // various integrity checks
446 //
447 int cmp = bv1.compare(bv2);
448 if (cmp != 0)
449 throw runtime_error("Error. RB-search mismatch!");
450 cmp = bv1.compare(bv3);
451 if (cmp != 0)
452 throw runtime_error("Error. scanner mismatch!");
453
454 cmp = bv1.compare(bv4);
455 if (cmp != 0)
456 throw runtime_error("Error. binary scanner mismatch!");
457
458 if (bv1.count() != bench_size)
459 throw runtime_error("Error. Search result missing elements!");
460
461
462}
463
464
465int main(int argc, char *argv[])
466{
467 if (argc < 3)
468 {
469 show_help();
470 return 1;
471 }
472
473 string_vector str_vec; // dictionary vector (STL)
474 str_sparse_vect str_sv; // bit-transposed sparse vector
475
476 try
477 {
478 auto ret = parse_args(argc, argv);
479 if (ret != 0)
480 {
481 return ret;
482 }
483 if (!i_name.empty())
484 {
485 auto res = load_dict_report(i_name, str_vec);
486 if (res != 0)
487 {
488 return res;
489 }
490 cout << "Loaded " << str_vec.size() << " dictionary names." << endl;
491
492 std::sort(str_vec.begin(), str_vec.end());
493 }
494
495 if (str_vec.size()) // load the sparse vector
496 {
497 bm::chrono_taker tt1(cout, "2. build sparse vector", 1, &timing_map);
498 {
499 // use insert iterator to load vector (faster than push-back)
500 //
501 auto bi = str_sv.get_back_inserter();
502 for (const string& term : str_vec)
503 bi = term;
504 bi.flush();
505 }
506
507 // build remapped (dense) succinct vector
508 // (this should be final), no more edits in this form
509 if (is_remap)
510 {
511 str_sparse_vect str_sv_remap;
512 str_sv_remap.remap_from(str_sv);
513 str_sv.swap(str_sv_remap);
514 }
515
517 str_sv.optimize(tb); // memory optimization after load
518 }
519
520 if (!sv_in_name.empty())
521 {
522 {
523 bm::chrono_taker tt1(cout, "8. Load sparse vector", 1, &timing_map);
524 file_load_svector(str_sv, sv_in_name);
525 }
526 if (str_sv.empty())
527 {
528 cout << "Input vector empty!" << endl;
529 exit(1);
530 }
531 if (str_vec.empty())
532 {
533 for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
534 {
535 string s;
536 str_sv.get(i, s);
537 str_vec.emplace_back(std::move(s));
538 } // for
539 }
540 }
541
542 // save SV vector to file
543 if (!sv_out_name.empty() && !str_sv.empty())
544 {
545 bm::chrono_taker tt1(cout, "7. Save sparse vector", 1, &timing_map);
546 file_save_svector(str_sv, sv_out_name, 0, is_xor);
547
548 str_sparse_vect str_sv_control;
549 file_load_svector(str_sv_control, sv_out_name);
550 bool eq = str_sv.equal(str_sv_control);
551 if (!eq)
552 {
553 cerr << "Serialization control failed" << endl;
554 assert(0); exit(1);
555 }
556 }
557
558
559
560 if (is_diag)
561 {
562 if (!str_sv.empty())
563 {
564 print_svector_stat(cout,str_sv, false);
565 }
566
567 if (str_vec.size())
568 {
569 size_t total_size = 0;
570 for (const string& term : str_vec)
571 {
572 total_size += term.size();
573 }
574 cout << "String dictionary size: "
575 << total_size / 1024 << "KB (" << total_size / (1024*1024) << "MB)"
576 << endl;
577 }
578
579 if (str_sv.size() && str_vec.size())
580 {
581 cout << "Run full comparison check..." << endl;
582 check_sparse(str_sv, str_vec); // run a paranoiya check
583 cout << "Ok" << endl;
584 }
585 }
586
587 if (is_bench) // run set of benchmarks
588 {
589 run_benchmark(str_sv, str_vec);
590 }
591
592 if (is_timing) // print all collected timings
593 {
594 std::cout << std::endl << "Performance:" << std::endl;
596 }
597 }
598 catch (std::exception& ex)
599 {
600 std::cerr << "Error:" << ex.what() << std::endl;
601 return 1;
602 }
603
604 return 0;
605}
606
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition: bm.h:1480
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store)
Definition: bm.h:1278
size_type count() const BMNOEXCEPT
population count (count of ON bits)
Definition: bm.h:2366
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:4153
void resize(size_type new_size)
Change size of the bvector.
Definition: bm.h:2428
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3709
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
algorithms for sparse_vector scan/search
bool bfind_eq_str(const SV &sv, const value_type *str, size_type &pos)
binary find first sparse vector element (string) Sparse vector must be sorted.
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
void flush()
flush the accumulated buffer.
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void swap(str_sparse_vector &str_sv) BMNOEXCEPT
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
bool empty() const
return true if vector is empty
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector Remapped vector likely saves memo...
size_type size() const
return size of the vector
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
bvector_type::size_type size_type
std::random_device rand_dev
Definition: sample11.cpp:51
std::uniform_int_distribution rand_dis(1, int(vector_max))
std::mt19937 gen(rand_dev())
int main(int argc, char *argv[])
Definition: xsample05.cpp:465
static int parse_args(int argc, char *argv[])
Definition: xsample05.cpp:98
bool is_bench
Definition: xsample05.cpp:92
std::string sv_in_name
Definition: xsample05.cpp:88
static void show_help()
Print help.
Definition: xsample05.cpp:69
vector< string > string_vector
Definition: xsample05.cpp:189
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample05.cpp:193
bool is_diag
Definition: xsample05.cpp:90
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
Definition: xsample05.cpp:198
std::string sv_out_name
Definition: xsample05.cpp:87
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
Definition: xsample05.cpp:318
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
Definition: xsample05.cpp:270
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correctness.
Definition: xsample05.cpp:248
bool is_xor
Definition: xsample05.cpp:94
std::string i_name
Definition: xsample05.cpp:89
const unsigned benchmark_max
Definition: xsample05.cpp:266
bool is_remap
Definition: xsample05.cpp:93
bool is_timing
Definition: xsample05.cpp:91
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect
Definition: xsample05.cpp:188