BitMagic-C++
strsvsample07.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2021 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample07.cpp
20 Succinct container scanner search using pipeline to run thousands of searches faster
21 one by one scans. scanner::pipeline uses variuous cache and algorithmic optimization techniques
22 to run bulk searches faster.
23
24 \sa bm::str_sparse_vector
25 \sa bm::sparse_vector_scanner
26 \sa bm::sparse_vector_scanner::pipeline
27*/
28
29/*! \file strsvsample07.cpp
30 \brief Example: Succinct container for strings, bulk search using scanner pipeline
31*/
32
33#include <iostream>
34#include <string>
35#include <vector>
36#include <memory>
37#include <cassert>
38#include <thread>
39
40#include "bm.h"
41#include "bmstrsparsevec.h"
42#include "bmsparsevec_algo.h"
43
44#include "bmtimer.h"
45#include "bmdbg.h"
46
47#include "bmundef.h" /* clear the pre-proc defines from BM */
48
49using namespace std;
50
53
54/**
55
56 Test data generation.
57 max_coll - defines the number of string variants
58 repeat_factor - how often strings should be duplicated (to simulate the compressable collections),
59 higher repeat_factor produces more compressable vector.
60 */
61static
62void GenerateTestData(std::vector<string>& str_coll,
63 str_sv_type& str_sv,
64 unsigned max_coll = 8000000,
65 unsigned repeat_factor=10)
66{
67 // use back inserter to fill in succinct vector (it is faster than push_back)
68 auto bi(str_sv.get_back_inserter());
69
70 string str;
71 for (unsigned i = 10; i < max_coll; i+= (rand()&0xF))
72 {
73 switch (i & 0xF)
74 {
75 case 0: str = "AB"; break;
76 case 1: str = "GTx"; break;
77 case 2: str = "cnv"; break;
78 default: str = "AbY11"; break;
79 }
80 str.append(to_string(i));
81
82 for (unsigned k = 0; k < repeat_factor; ++k)
83 {
84 str_coll.emplace_back(str);
85 bi = str; // feed into SV back-inserter
86 }
87 } // for i
88 bi.flush();
89}
90
91bool is_diag = true; ///< Flag to print the SV diagnostics
92
93/// Rudimentary cmd-args parser
94static
95void parse_args(int argc, char *argv[])
96{
97 for (int i = 1; i < argc; ++i)
98 {
99 std::string arg = argv[i];
100 if (arg == "-nodiag")
101 {
102 is_diag = false;
103 continue;
104 }
105 } // for i
106}
107
108
109
110
111int main(int argc, char *argv[])
112{
113 try
114 {
115 parse_args(argc, argv);
116
117
118 std::vector<string> str_coll;
119 str_sv_type str_sv0(bm::use_null); // sparse-succinct vector
120
121 cout << "Generating the test data... " << flush;
122 GenerateTestData(str_coll, str_sv0);
123 str_sv_type str_sv1(str_sv0); // make a copy of the original vector
124
125 cout << "OK" << endl;
126
127
128 {
129 cout << "Remapping the data to create compressed vector " << flush;
131 // apply char frequency remapping compression
132 // (should not modify after that)
133 str_sv0.remap();
134 str_sv0.optimize(tb); // optimize the succinct vector
135
136 cout << "OK" << endl;
137 }
138
139 // we have two succinct vectors str_sv0 (remapped and optimized)
140 // and str_sv1 - original after construction
141 // - print statistics to take a look into details
142 //
143
144 if (is_diag)
145 {
146 cout << "\nStatistics on generated SV:" << endl;
147 bm::print_svector_stat(cout, str_sv1);
148 // diagnostics print to see the details of succinct structures
149 cout << "\nStatistics on remapped/optimized SV:" << endl;
150 bm::print_svector_stat(cout, str_sv0);
151 cout << endl << endl;
152 }
153
154 // create a random sampling of strings to search
155 //
156 unsigned test_runs = 10000;
157 std::vector<string> str_test_coll;
158 for (bvector_type::size_type i = 0; i < test_runs; ++i)
159 {
160 bvector_type::size_type idx = (unsigned) rand() % test_runs;
161 if (idx >= test_runs)
162 idx = test_runs/2;
163 str_test_coll.push_back(str_coll[idx]);
164 }
165 assert(str_test_coll.size() == test_runs);
166
167 // -------------------------------------------------------------
168
169 std::vector<unique_ptr<bvector_type> > res_vec1;
171
172
173 cout << "Running benchmark tests.." << endl;
174
175 for (int pass = 0; pass < 2; pass++)
176 {
177 cout << "PASS = " << pass << ((pass==0) ? " -- remap/optimized" : " -- NOT remapped") << endl;
178
179 res_vec1.resize(0);
180 const str_sv_type* str_sv = (pass==0) ? &str_sv0 : &str_sv1;
181
182 // Run experiment 1, search sparse vector in a loop one-by-one
183 // This is not a slow method, scanner uses various optimizations
184 // (SIMD, "search space prunning" to be efficient)
185 {
186 bm::chrono_taker tt(cout, "scanner<>::find_eq_str()", test_runs);
187
188 for (bvector_type::size_type i = 0; i < test_runs; ++i)
189 {
190 const string& str = str_test_coll[i];
191 unique_ptr<bvector_type> bv_res(new bvector_type);
192 scanner.find_eq_str(*str_sv, str.c_str(), *bv_res);
193 res_vec1.emplace_back(unique_ptr<bvector_type>(bv_res.release()));
194 } // for
195 }
196
197 // There is a faster way to do the same.
198 // if we know a bulk of our searches upfront we can use pipeline
199 // schedule to run the search group.
200 // Scanner pipeline will anayse the set and try to build a more
201 // optimal search plan, taking into account CPU cache optimization,
202 // resuse of compressed bit-blocks (inetrnal details) and other
203 // factors between the search data set items and the sparce vector
204 //
205
206 // pipeline object to run the bulk search
207 //
209
210 // batch_size instructs how many search to run at once
211 // batch_size=0 and this parameter will be identified automatically
212 //
213 // batch size essentially depends on CPU cache size and it is
214 // sometimes difficult to determine without trying.
215 // batch_size=0 will try to use euristics for CPU L2 = 256KB,
216 // it may be "good enough", but for best results it is best
217 // to run trial runs with typical values may be (2 to 20)
218 //
219 pipe1.options().batch_size = test_runs;
220 {
221 bm::chrono_taker tt(cout, "scanner::pipeline find_eq_str()", test_runs);
222
223 // add all the search items to the pipeline
224 for (size_t i = 0; i < test_runs; ++i)
225 {
226 const string& str = str_test_coll[i];
227 pipe1.add(str.c_str());
228 }
229 pipe1.complete(); // finish the pipeline construction with this call
230
231 scanner.find_eq_str(pipe1); // run the search
232
233 // at this point we have the results (in the pipeline object itself)
234 }
235
236 // This example sets the search range for only a part of
237 // the sv vector using the mask bit-vector.
238 // Since mask vector narrows down the search space - it is faster
239 //
240 bm::bvector<> bv_mask;
241 bv_mask.set_range(0, str_sv->size()/3); // only first 1/3 of elements will be searched
242
243
244 // scanner is configured to support result vectors AND search masking
245 //
246 typedef bm::agg_run_options<true, false, true> scanner_custom_mask_opt;
247 bm::sparse_vector_scanner<str_sv_type>::pipeline<scanner_custom_mask_opt> pipe1_and(*str_sv);
248
249 pipe1_and.set_search_mask(&bv_mask); // associate search mask with the pipeline
250 pipe1_and.options().batch_size = test_runs;
251
252 {
253 bm::chrono_taker tt(cout, "scanner::pipeline+MASK find_eq_str()", test_runs);
254
255 // add all the search items to the pipeline
256 for (size_t i = 0; i < test_runs; ++i)
257 {
258 const string& str = str_test_coll[i];
259 pipe1_and.add(str.c_str()); // this will search within defined mask
260 }
261 pipe1_and.complete();
262 scanner.find_eq_str(pipe1_and); // run the search
263 }
264
265
266 // This variant would build the same pipeline but configure
267 // it differently using pipe2.options()
268 //
269 // The idea here is that sometimes we don't actually need the result
270 // vectors, but only need population count from this
271 // pipeline can be configured to do that without actually
272 // materializing result bit-vectors
273 //
274
275 // pipeline configuration passed via template parameter
276 // instructs to drop results and provide only counts
277
279 pipe1.options().batch_size = test_runs;
280
281 {
282 bm::chrono_taker tt(cout, "scanner::pipeline find_eq_str()-count()", test_runs);
283
284 for (size_t i = 0; i < test_runs; ++i)
285 {
286 const string& str = str_test_coll[i];
287 pipe2.add(str.c_str());
288 }
289 pipe2.complete(); // finish the pipeline construction with this call
290
291 scanner.find_eq_str(pipe2); // run the search pipeline
292
293 // at this point we have the population count
294 // ... see below how to use it
295 }
296
297 // results from two pileline runs are ready at this point
298 // we now get access to it
299 //
300
301 bvector_type bv_or_total;
302
303 {
304 // please note that returned buffer results vectors are NOT STL
305 // vector<> type, but a thin wrapper over C style objects
306 // (for the reasons of portability)
307 //
308 auto& res_vect = pipe1.get_bv_res_vector(); // results from pipeline 1
309 auto& res_vect_and = pipe1_and.get_bv_res_vector();
310 auto& cnt_vect = pipe2.get_bv_count_vector(); // counts from pileine 2
311
312 assert(res_vect.size() == cnt_vect.size());
313
314 // iterate over results, run some checks...
315 size_t res_sz = res_vect.size();
316 for (size_t i = 0; i < res_sz; ++i)
317 {
318 const bvector_type* bv1 = res_vec1[i].get();
319 assert(bv1);
320 const bvector_type* bv = res_vect[i];
321 assert(bv);
322 bool match = bv1->equal(*bv); // quick check
323 assert(match);
324
325 auto c = cnt_vect[i];
326 auto cnt = bv->count();
327 (void)cnt; (void)c; // to silence unused warnings (relese)
328 assert(cnt == c); // check if counts match
329
330 bv_or_total |= *bv; // accumulate OR
331 {
332 auto c_and = bm::count_and(*bv, bv_mask);
333 const bvector_type* bv_and = res_vect_and[i];
334 if (bv_and)
335 {
336 auto c1 = bv_and->count();
337 assert(c1 == c_and); (void)c1; (void)c_and;
338 bvector_type bv_m;
339 bv_m.bit_and(*bv1, bv_mask);
340 match = bv_and->equal(bv_m);
341 assert(match);
342 }
343 else
344 {
345 assert(!c_and);
346 }
347
348
349 }
350 }
351 }
352
353 // Here we define a csutom pipeline run policy which disables
354 // both intermediate results and population counting for it...
355 //
356 // instead it aggregates the results into one UNION ALL (OR) vector
357 // which simulates a huge
358 // field1 IN ('value1', 'value2', 'value3', .... ) SQL expression
359 //
360
361 typedef bm::agg_run_options<false, false> scanner_custom_opt;
362 bm::sparse_vector_scanner<str_sv_type>::pipeline<scanner_custom_opt> pipe3(*str_sv);
363 pipe1.options().batch_size = test_runs;
364
365 bvector_type bv_or;
366 pipe3.set_or_target(&bv_or); // Assign OR aggregation target
367
368 {
369 bm::chrono_taker tt(cout, "scanner::pipeline find_eq_str()-OR()", test_runs);
370
371 for (size_t i = 0; i < test_runs; ++i)
372 {
373 const string& str = str_test_coll[i];
374 pipe3.add(str.c_str());
375 }
376 pipe3.complete(); // finish the pipeline construction with this call
377
378 scanner.find_eq_str(pipe3); // run the search pipeline
379 }
380 bool match = bv_or.equal(bv_or_total);
381 if (!match)
382 {
383 cerr << "OR vector mismatch!" << endl;
384 exit(1);
385 }
386
387
388
389
390 cout << endl;
391
392 // second pass will run the same benchmarks, only using original
393 // non-remapped vector to see the effects of additional compression
394 // on performance of scanner searches
395
396 } // for pass
397
398
399 }
400 catch(std::exception& ex)
401 {
402 std::cerr << ex.what() << std::endl;
403 return 1;
404 }
405
406
407 return 0;
408}
409
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bool equal(const bvector< Alloc > &bvect) const BMNOEXCEPT
Equal comparison with an agr bit-vector.
Definition: bm.h:1995
size_type count() const BMNOEXCEPT
population count (count of ON bits)
Definition: bm.h:2366
bm::bvector< Alloc > & bit_and(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand AND : this := bv1 AND bv2
Definition: bm.h:5880
bvector_size_type size_type
Definition: bm.h:121
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
Definition: bm.h:2333
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
Pipeline to run multiple searches against a particular SV faster using cache optimizations.
void complete()
Prepare pipeline for the execution (resize and init internal structures) Once complete,...
bvect_vector_type & get_bv_res_vector() BMNOEXCEPT
Return results vector used for pipeline execution.
void add(const typename SV::value_type *str)
Add new search string.
run_options & options() BMNOEXCEPT
Set pipeline run options.
algorithms for sparse_vector scan/search
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
size_type size() const
return size of the vector
void remap()
Build remapping profile and re-load content to save memory.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:229
BV::size_type count_and(const BV &bv1, const BV &bv2) BMNOEXCEPT
Computes bitcount of AND operation of two bitsets.
Definition: bmalgo.h:49
int main(int argc, char *argv[])
static void parse_args(int argc, char *argv[])
Rudimentary cmd-args parser.
bm::bvector bvector_type
static void GenerateTestData(std::vector< string > &str_coll, str_sv_type &str_sv, unsigned max_coll=8000000, unsigned repeat_factor=10)
Test data generation.
bm::str_sparse_vector< char, bvector_type, 8 > str_sv_type
bool is_diag
Flag to print the SV diagnostics.
Aggregation options to control execution Default settings are to support only result bit-vector filte...
Definition: bmaggregator.h:66