BitMagic-C++
xsample09.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2020 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample09.cpp
20
21 Then it computes variants of density histograms of using
22 presense/absense (NOT NULL bit-vector) and bm::bvector<>::count_range()
23 function.
24
25 @sa bm::bvector
26 @sa bm::bvector::find_reverse
27 @sa bm::bvector::find
28
29 @sa bm::rank_range_split
30
31 @sa bm::sparse_vector
32 @sa bm::rsc_sparse_vector
33 @sa bm::rsc_sparse_vector::get_back_inserter
34*/
35
36/*! \file xsample09.cpp
37 \brief Example: Use succinct vectors for histogram construction
38*/
39
40
41#include <iostream>
42#include <memory>
43#include <vector>
44#include <random>
45#include <algorithm>
46#include <stdexcept>
47
48using namespace std;
49
50#include "bm.h"
51#include "bmtimer.h"
52#include "bmsparsevec.h"
53#include "bmsparsevec_compr.h"
54
55#include "bmdbg.h"
56
57#include "bmundef.h" /* clear the pre-proc defines from BM */
58
59// ----------------------------------------------------
60// Global parameters and types
61// ----------------------------------------------------
62
63const unsigned test_size = 250000000; // number of events (ints) to generate
64const unsigned sampling_interval = 2500; // size of the histogram sampling
65
70
71typedef std::vector<std::pair<bv_size_type, bv_size_type> > bv_ranges_vector;
72
73
74
75// timing storage for benchmarking
77
78
79/// Generate a test RSC vector with a randomly distributed values
80/// imitating distribution density of genome variations
81/// it adds a huge area of empty in the middle to simulate chr centromere
82static
84{
86 unsigned cnt = 0;
87
88 // part one: dense
89 {
90 auto bit = csv.get_back_inserter(); // fastest way to add the data
91 for (unsigned i = 0; i < size/3; )
92 {
93 bit = cnt++;
94 unsigned nc = (unsigned)(rand() % 16);
95 bit.add_null(nc);
96
97 i+=nc+1;
98 } // for
99 bit.flush();
100 }
101 // part two: empty + dense again
102 {
103 auto bit = csv.get_back_inserter(); // fastest way to add the data
104 bit.add_null(size/3); // add huge emty space in the middle
105
106 for (unsigned i = 0; i < size; )
107 {
108 bit = cnt++;
109 unsigned nc = (unsigned)(rand() % 16);
110 bit.add_null(nc);
111
112 i+=nc+1;
113 } // for
114 bit.flush();
115 }
116
117 csv.optimize(tb); // memory compression
118 csv.sync(); // construct the Rank-Select access index
119}
120
121/// generate list of random indexes (locations) to read histogram values
122///
123static
124void generate_access_samples(std::vector<bvector_type::size_type> &sample_vec,
125 unsigned size)
126{
127 for (unsigned i = 0; i < size; )
128 {
129 unsigned sample = (unsigned)(rand() % 256); // random
130 sample_vec.push_back(sample);
131 i += sample;
132 }
133 std::random_device rd;
134 std::mt19937 g(rd());
135
136 // even more random (unordered)
137 std::shuffle(sample_vec.begin(), sample_vec.end(), g);
138}
139
140/// Compute histogram as a SV vector using fixed sampling interval
141///
142static
144 const rsc_sparse_vector_u32& csv,
145 sparse_vector_u32::size_type sampling_size)
146{
147 assert(sampling_size);
148 assert(sampling_size < csv.size());
149
151
153 sparse_vector_u32::size_type to = sampling_size-1;
154
155 // get NOT NULL vector
157 assert(bv_null);
158
159 {
160 auto bit = hist_sv.get_back_inserter();
161 auto sz = csv.size();
162 do
163 {
164 auto cnt = bv_null->count_range(from, to); // closed interval [from..to]
165 bit = cnt;
166 from += sampling_size;
167 to = from + sampling_size - 1;
168 } while (from < sz);
169 bit.flush();
170 }
171 hist_sv.optimize(tb); // memory compaction
172}
173
174/// Compute histogram as a RSC vector using fixed sampling interval.
175/// Histogram values are stored as "true" interval start coordinates and
176/// it is a more flexible scheme if we eventually decide to use adaptive
177/// sampling (variable step).
178///
179static
181 const rsc_sparse_vector_u32& csv,
182 sparse_vector_u32::size_type sampling_size)
183{
184 assert(sampling_size);
185 assert(sampling_size < csv.size());
186
187 {
188 // important! bm::use_null if vector is later converted to RSC
190
192 rsc_sparse_vector_u32::size_type to = sampling_size-1;
193
195 assert(bv_null);
196
197 auto sz = csv.size();
198 do
199 {
200 auto cnt = bv_null->count_range(from, to); // closed interval [from..to]
201 hist_sv.set(from, cnt); // assign historgam value at the interval start
202
203 from += sampling_size;
204 to = from + sampling_size - 1;
205 } while (from < sz);
206
207 hist_rsc.load_from(hist_sv);
208 }
209
211 hist_rsc.optimize(tb);
212 hist_rsc.sync();
213}
214
215/// Adaptive histogram identifies number of not NULL elements (events)
216/// and varies the size of the histogram bin trying to make sure all
217/// bins (but last) are the same weight
218///
219static
221 const rsc_sparse_vector_u32& csv,
222 sparse_vector_u32::size_type sampling_size)
223{
224 assert(sampling_size);
226
227 const bvector_type* bv_null = csv.get_null_bvector();
228 bv_size_type split_rank = sampling_size;
229
230 bv_ranges_vector pair_vect;
231 bm::rank_range_split(*bv_null, split_rank, pair_vect);
232
233 size_t sz = pair_vect.size();
234 for (size_t k = 0; k < sz; ++k)
235 {
236 const auto& p = pair_vect[k]; // [from..to] sampling rank interval
237 hist_rsc.push_back(p.first, 0); // split_rank);
238 } // for
239 hist_rsc.optimize(tb);
240 hist_rsc.sync();
241}
242
243/// Some test to confirm correctness
244///
245static
247 const sparse_vector_u32& hist_sv,
248 sparse_vector_u32::size_type sampling_size)
249{
250 const rsc_sparse_vector_u32::bvector_type* bv_null = hist_rsc.get_null_bvector();
251 auto en = bv_null->get_enumerator(0);
252 for (;en.valid(); ++en)
253 {
254 auto idx = *en;
255 rsc_sparse_vector_u32::size_type sv_idx = (idx / sampling_size);
256 auto v1 = hist_rsc[idx];
257 auto v2 = hist_sv[sv_idx];
258 if (v1 != v2)
259 {
260 cerr << "Discrepancy at:" << idx << endl;
261 exit(1);
262 }
263 } // for
264}
265
266
267/// Access benchmark 1
268///
269/// uses regular bit-transposed sparse vector to read
270/// histogram values in random order. It relies on fixed inetrval sampling.
271///
272static
273unsigned long long access_bench1(const sparse_vector_u32& hist_sv,
274 const std::vector<bvector_type::size_type>& sample_vec,
275 unsigned sampling_size)
276{
277 unsigned long long sum = 0;
278 for (size_t i = 0; i < sample_vec.size(); ++i)
279 {
280 auto idx = sample_vec[i];
281 idx = idx / sampling_size; // interval is calculated by division
282 auto v = hist_sv[idx];
283 sum += v;
284 }
285 return sum;
286}
287
288/// Access benchmark 2
289///
290/// Uses Rank-Select bit-transposed vector to read histogram values
291/// Sampling interval can be non-fixed (variadic, adaptive sampling).
292/// Method finds the interval start and value using RSC container not NULL vector
293static
294unsigned long long access_bench2(const rsc_sparse_vector_u32& hist_rsc,
295 const std::vector<bvector_type::size_type>& sample_vec)
296{
297 const bvector_type* bv_null = hist_rsc.get_null_bvector();
298 assert(bv_null);
299
300 unsigned long long sum = 0;
301 for (size_t i = 0; i < sample_vec.size(); ++i)
302 {
303 auto idx = sample_vec[i];
304
305 // search back into container not NULL bit-vector to find sampling
306 // interval start position
308 bool found = bv_null->find_reverse(idx, pos);
309 assert(found); (void)found;
310
311 auto v = hist_rsc[pos];
312 sum += v;
313 }
314 return sum;
315}
316
317
318static
320 const std::vector<bvector_type::size_type>& sample_vec,
321 const rsc_sparse_vector_u32& rsc_data)
322{
324 // find the last element in the data-set
325 {
326 const bvector_type* bv_null = rsc_data.get_null_bvector();
327 bool found = bv_null->find_reverse(last);
328 assert(found); (void)found;
329 }
330
331 const bvector_type* bv_null = hist_rsc.get_null_bvector();
332 assert(bv_null);
333
334 for (size_t i = 0; i < sample_vec.size(); ++i)
335 {
336 auto idx = sample_vec[i];
337
338 // search back into container not NULL bit-vector to find sampling
339 // interval start position
340 bvector_type::size_type pos_start, pos_end;
341 bool found = bv_null->find_reverse(idx, pos_start);
342 assert(found);
343
344 found = bv_null->find(idx+1, pos_end);
345 if (!found)
346 pos_end = last;
347
348 // please note that we don't need number of events here
349 // (as it is always the same) we need the interval [start......end]
350 //
351 // (end - start) / sampling_rank defines the average density
352 //
353 }
354}
355
356
357int main(void)
358{
359 try
360 {
361 rsc_sparse_vector_u32 rsc_test;
362 bv_size_type data_set_size = 0;
363
364 sparse_vector_u32 hist1;
365 unsigned hist1_avg = 0;
366
369
370 std::vector<bvector_type::size_type> svec;
371
372 generate_test_data(rsc_test, test_size);
373
374 {
375 const bvector_type* bv_null = rsc_test.get_null_bvector();
376 assert(bv_null);
377 data_set_size = bv_null->count(); // number of elements in the data set
378 cout << "Data set size: " << rsc_test.size() << endl;
379 cout << "Number of elements/events in the data set: " << data_set_size << endl;
380 }
381
382 {
383 bm::chrono_taker tt1(cout, "01. Histogram 1 construction (SV)) ", 1, &timing_map);
384 compute_historgam(hist1, rsc_test, sampling_interval);
385 }
386 // explore some statistics on SV histogram 1
387 {
388 sparse_vector_u32::statistics st;
389 hist1.calc_stat(&st);
390
391 cout << "Histogram 1 (SV)" << endl;
392 cout << " size: " << hist1.size() << endl;
393 cout << " RAM size: " << st.memory_used << endl;
394 {
395 bm::chrono_taker tt1(cout, "05. Serialize and save the histogram 1 (SV)", 1, &timing_map);
396 size_t serialized_size = 0;
397 int res = bm::file_save_svector(hist1, "hist1.sv", &serialized_size, true);
398 if (res!=0)
399 cerr << "Failed to save!" << endl;
400 else
401 cout << " file size: " << serialized_size << endl;
402 }
403
404 // calculate sum and average value in historgam 1
405 //
406
407 unsigned h1_sum = 0;
408 auto it = hist1.begin();
409 auto it_end = hist1.end();
410 for (; it != it_end; ++it)
411 h1_sum += *it;
412 assert (h1_sum == data_set_size);
413 hist1_avg = h1_sum / hist1.size();
414 }
415
416 {
417 bm::chrono_taker tt1(cout, "02. Histogram 2 construction (RSC) ", 1, &timing_map);
419 }
420 {
421 rsc_sparse_vector_u32::statistics st;
422 hist2.calc_stat(&st);
423
424 cout << "Histogram 2 (RSC)" << endl;
425 cout << " size: " << hist2.size() << endl;
426 cout << " NOT NULL count: " << hist2.get_null_bvector()->count() << endl;
427 cout << " RAM size: " << st.memory_used << endl;
428
429 {
430 bm::chrono_taker tt1(cout, "06. Serialize and save histogram 2(RSC)", 1, &timing_map);
431 size_t serialized_size = 0;
432 int res = bm::file_save_svector(hist2, "hist2.sv", &serialized_size, true);
433 if (res!=0)
434 cerr << "Failed to save!" << endl;
435 else
436 cout << " file size: " << serialized_size << endl;
437 }
438 }
439
440 {
441 bm::chrono_taker tt1(cout, "03. Histogram 3 (adaptive) construction (RSC) ", 1, &timing_map);
442 compute_adaptive_rsc_histogram(hist3, rsc_test, hist1_avg);
443 }
444
445 {
446 rsc_sparse_vector_u32::statistics st;
447 hist3.calc_stat(&st);
448
449 cout << "Histogram 3 adaptive (RSC)" << endl;
450 cout << " size: " << hist3.size() << endl;
451 cout << " NOT NULL count: " << hist3.get_null_bvector()->count() << endl;
452 cout << " RAM size: " << st.memory_used << endl;
453 {
454 bm::chrono_taker tt1(cout, "07. Serialize and save the adaptive histogram 3 (RSC)", 1, &timing_map);
455 size_t serialized_size = 0;
456 int res = bm::file_save_svector(hist3, "hist3.sv", &serialized_size, true);
457 if (res!=0)
458 cerr << "Failed to save!" << endl;
459 else
460 cout << " file size: " << serialized_size << endl;
461 }
462 }
463
465 cout << endl;
466 cout << "Access sample size = " << svec.size() << endl;
467
468 {
469 bm::chrono_taker tt1(cout, "04. Verification", 1, &timing_map);
471 }
472
473
474 unsigned long long sum1(0), sum2(0);
475 {
476 bm::chrono_taker tt1(cout, "08. Random access test H1 (SV)", 1, &timing_map);
477 sum1 = access_bench1(hist1, svec, sampling_interval);
478 }
479
480 {
481 bm::chrono_taker tt1(cout, "09. Random access test H2 (RSC)", 1, &timing_map);
482 sum2 = access_bench2(hist2, svec);
483 }
484
485 if (sum1 != sum2) // paranoiya check
486 {
487 cerr << "Control sum discrepancy!" << endl;
488 return 1;
489 }
490
491 {
492 bm::chrono_taker tt1(cout, "10. Random access test H3 adaptive (RSC)", 1, &timing_map);
493 access_bench3(hist3, svec, rsc_test);
494 }
495
496 cout << endl;
498
499 }
500 catch(std::exception& ex)
501 {
502 std::cerr << ex.what() << std::endl;
503 return 1;
504 }
505
506 return 0;
507}
508
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bool find(size_type &pos) const BMNOEXCEPT
Finds index of first 1 bit.
Definition: bm.h:4855
size_type count() const BMNOEXCEPT
population count (count of ON bits)
Definition: bm.h:2366
bvector_size_type size_type
Definition: bm.h:121
bool find_reverse(size_type &pos) const BMNOEXCEPT
Finds last index of 1 bit.
Definition: bm.h:4673
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void push_back(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
back_insert_iterator get_back_inserter()
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
void calc_stat(struct rsc_sparse_vector< Val, SV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL)
size_type size() const BMNOEXCEPT
return size of the vector
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
size_type size() const BMNOEXCEPT
return size of the vector
Definition: bmsparsevec.h:711
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1804
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
Definition: bmsparsevec.h:559
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
Definition: bmsparsevec.h:572
void calc_stat(struct sparse_vector< Val, BV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
Definition: bmsparsevec.h:2078
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
Definition: bmsparsevec.h:2256
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmsparsevec.h:2094
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:229
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
Definition: bmalgo.h:394
bm::bvector ::size_type bv_size_type
Definition: sample24.cpp:54
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Definition: sample24.cpp:55
const unsigned test_size
Definition: xsample09.cpp:63
bvector_type::size_type bv_size_type
Definition: xsample09.cpp:67
static void verify_histograms(const rsc_sparse_vector_u32 &hist_rsc, const sparse_vector_u32 &hist_sv, sparse_vector_u32::size_type sampling_size)
Some test to confirm correctness.
Definition: xsample09.cpp:246
static void compute_adaptive_rsc_histogram(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Adaptive histogram identifies number of not NULL elements (events) and varies the size of the histogr...
Definition: xsample09.cpp:220
bm::bvector bvector_type
Definition: xsample09.cpp:66
const unsigned sampling_interval
Definition: xsample09.cpp:64
static void compute_rsc_historgam(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a RSC vector using fixed sampling interval.
Definition: xsample09.cpp:180
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Definition: xsample09.cpp:71
bm::sparse_vector< unsigned, bvector_type > sparse_vector_u32
Definition: xsample09.cpp:68
int main(void)
Definition: xsample09.cpp:357
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample09.cpp:76
static void compute_historgam(sparse_vector_u32 &hist_sv, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a SV vector using fixed sampling interval.
Definition: xsample09.cpp:143
static void access_bench3(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec, const rsc_sparse_vector_u32 &rsc_data)
Definition: xsample09.cpp:319
static unsigned long long access_bench1(const sparse_vector_u32 &hist_sv, const std::vector< bvector_type::size_type > &sample_vec, unsigned sampling_size)
Access benchmark 1.
Definition: xsample09.cpp:273
static void generate_test_data(rsc_sparse_vector_u32 &csv, unsigned size)
Generate a test RSC vector with a randomly distributed values imitating distribution density of genom...
Definition: xsample09.cpp:83
static void generate_access_samples(std::vector< bvector_type::size_type > &sample_vec, unsigned size)
generate list of random indexes (locations) to read histogram values
Definition: xsample09.cpp:124
static unsigned long long access_bench2(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec)
Access benchmark 2.
Definition: xsample09.cpp:294
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: xsample09.cpp:69