BitMagic-C++
xsample09.cpp

Then it computes variants of density histograms of using presense/absense (NOT NULL bit-vector) and bm::bvector<>::count_range() function.

See also
bm::bvector
bm::bvector::find_reverse
bm::bvector::find
bm::rank_range_split
bm::sparse_vector
bm::rsc_sparse_vector
bm::rsc_sparse_vector::get_back_inserter
/*
Copyright(c) 2020 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
For more information please visit: http://bitmagic.io
*/
/** \example xsample09.cpp
Then it computes variants of density histograms of using
presense/absense (NOT NULL bit-vector) and bm::bvector<>::count_range()
function.
@sa bm::bvector
@sa bm::bvector::find_reverse
@sa bm::bvector::find
@sa bm::rank_range_split
@sa bm::sparse_vector
@sa bm::rsc_sparse_vector
@sa bm::rsc_sparse_vector::get_back_inserter
*/
/*! \file xsample09.cpp
\brief Example: Use succinct vectors for histogram construction
*/
#include <iostream>
#include <memory>
#include <vector>
#include <random>
#include <algorithm>
#include <stdexcept>
using namespace std;
#include "bm.h"
#include "bmtimer.h"
#include "bmsparsevec.h"
#include "bmdbg.h"
#include "bmundef.h" /* clear the pre-proc defines from BM */
// ----------------------------------------------------
// Global parameters and types
// ----------------------------------------------------
const unsigned test_size = 250000000; // number of events (ints) to generate
const unsigned sampling_interval = 2500; // size of the histogram sampling
typedef std::vector<std::pair<bv_size_type, bv_size_type> > bv_ranges_vector;
// timing storage for benchmarking
/// Generate a test RSC vector with a randomly distributed values
/// imitating distribution density of genome variations
/// it adds a huge area of empty in the middle to simulate chr centromere
static
void generate_test_data(rsc_sparse_vector_u32& csv, unsigned size)
{
unsigned cnt = 0;
// part one: dense
{
auto bit = csv.get_back_inserter(); // fastest way to add the data
for (unsigned i = 0; i < size/3; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
} // for
bit.flush();
}
// part two: empty + dense again
{
auto bit = csv.get_back_inserter(); // fastest way to add the data
bit.add_null(size/3); // add huge emty space in the middle
for (unsigned i = 0; i < size; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
} // for
bit.flush();
}
csv.optimize(tb); // memory compression
csv.sync(); // construct the Rank-Select access index
}
/// generate list of random indexes (locations) to read histogram values
///
static
void generate_access_samples(std::vector<bvector_type::size_type> &sample_vec,
unsigned size)
{
for (unsigned i = 0; i < size; )
{
unsigned sample = (unsigned)(rand() % 256); // random
sample_vec.push_back(sample);
i += sample;
}
std::random_device rd;
std::mt19937 g(rd());
// even more random (unordered)
std::shuffle(sample_vec.begin(), sample_vec.end(), g);
}
/// Compute histogram as a SV vector using fixed sampling interval
///
static
{
assert(sampling_size);
assert(sampling_size < csv.size());
sparse_vector_u32::size_type to = sampling_size-1;
// get NOT NULL vector
assert(bv_null);
{
auto bit = hist_sv.get_back_inserter();
auto sz = csv.size();
do
{
auto cnt = bv_null->count_range(from, to); // closed interval [from..to]
bit = cnt;
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
bit.flush();
}
hist_sv.optimize(tb); // memory compaction
}
/// Compute histogram as a RSC vector using fixed sampling interval.
/// Histogram values are stored as "true" interval start coordinates and
/// it is a more flexible scheme if we eventually decide to use adaptive
/// sampling (variable step).
///
static
{
assert(sampling_size);
assert(sampling_size < csv.size());
{
// important! bm::use_null if vector is later converted to RSC
rsc_sparse_vector_u32::size_type to = sampling_size-1;
assert(bv_null);
auto sz = csv.size();
do
{
auto cnt = bv_null->count_range(from, to); // closed interval [from..to]
hist_sv.set(from, cnt); // assign historgam value at the interval start
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
hist_rsc.load_from(hist_sv);
}
hist_rsc.optimize(tb);
hist_rsc.sync();
}
/// Adaptive histogram identifies number of not NULL elements (events)
/// and varies the size of the histogram bin trying to make sure all
/// bins (but last) are the same weight
///
static
{
assert(sampling_size);
const bvector_type* bv_null = csv.get_null_bvector();
bv_size_type split_rank = sampling_size;
bv_ranges_vector pair_vect;
bm::rank_range_split(*bv_null, split_rank, pair_vect);
size_t sz = pair_vect.size();
for (size_t k = 0; k < sz; ++k)
{
const auto& p = pair_vect[k]; // [from..to] sampling rank interval
hist_rsc.push_back(p.first, 0); // split_rank);
} // for
hist_rsc.optimize(tb);
hist_rsc.sync();
}
/// Some test to confirm correctness
///
static
const sparse_vector_u32& hist_sv,
{
auto en = bv_null->get_enumerator(0);
for (;en.valid(); ++en)
{
auto idx = *en;
rsc_sparse_vector_u32::size_type sv_idx = (idx / sampling_size);
auto v1 = hist_rsc[idx];
auto v2 = hist_sv[sv_idx];
if (v1 != v2)
{
cerr << "Discrepancy at:" << idx << endl;
exit(1);
}
} // for
}
/// Access benchmark 1
///
/// uses regular bit-transposed sparse vector to read
/// histogram values in random order. It relies on fixed inetrval sampling.
///
static
unsigned long long access_bench1(const sparse_vector_u32& hist_sv,
const std::vector<bvector_type::size_type>& sample_vec,
unsigned sampling_size)
{
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
idx = idx / sampling_size; // interval is calculated by division
auto v = hist_sv[idx];
sum += v;
}
return sum;
}
/// Access benchmark 2
///
/// Uses Rank-Select bit-transposed vector to read histogram values
/// Sampling interval can be non-fixed (variadic, adaptive sampling).
/// Method finds the interval start and value using RSC container not NULL vector
static
unsigned long long access_bench2(const rsc_sparse_vector_u32& hist_rsc,
const std::vector<bvector_type::size_type>& sample_vec)
{
const bvector_type* bv_null = hist_rsc.get_null_bvector();
assert(bv_null);
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
// search back into container not NULL bit-vector to find sampling
// interval start position
bool found = bv_null->find_reverse(idx, pos);
assert(found); (void)found;
auto v = hist_rsc[pos];
sum += v;
}
return sum;
}
static
void access_bench3(const rsc_sparse_vector_u32& hist_rsc,
const std::vector<bvector_type::size_type>& sample_vec,
const rsc_sparse_vector_u32& rsc_data)
{
// find the last element in the data-set
{
const bvector_type* bv_null = rsc_data.get_null_bvector();
bool found = bv_null->find_reverse(last);
assert(found); (void)found;
}
const bvector_type* bv_null = hist_rsc.get_null_bvector();
assert(bv_null);
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
// search back into container not NULL bit-vector to find sampling
// interval start position
bvector_type::size_type pos_start, pos_end;
bool found = bv_null->find_reverse(idx, pos_start);
assert(found);
found = bv_null->find(idx+1, pos_end);
if (!found)
pos_end = last;
// please note that we don't need number of events here
// (as it is always the same) we need the interval [start......end]
//
// (end - start) / sampling_rank defines the average density
//
}
}
int main(void)
{
try
{
bv_size_type data_set_size = 0;
unsigned hist1_avg = 0;
std::vector<bvector_type::size_type> svec;
{
const bvector_type* bv_null = rsc_test.get_null_bvector();
assert(bv_null);
data_set_size = bv_null->count(); // number of elements in the data set
cout << "Data set size: " << rsc_test.size() << endl;
cout << "Number of elements/events in the data set: " << data_set_size << endl;
}
{
bm::chrono_taker tt1(cout, "01. Histogram 1 construction (SV)) ", 1, &timing_map);
}
// explore some statistics on SV histogram 1
{
sparse_vector_u32::statistics st;
hist1.calc_stat(&st);
cout << "Histogram 1 (SV)" << endl;
cout << " size: " << hist1.size() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1(cout, "05. Serialize and save the histogram 1 (SV)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist1, "hist1.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
// calculate sum and average value in historgam 1
//
unsigned h1_sum = 0;
auto it = hist1.begin();
auto it_end = hist1.end();
for (; it != it_end; ++it)
h1_sum += *it;
assert (h1_sum == data_set_size);
hist1_avg = h1_sum / hist1.size();
}
{
bm::chrono_taker tt1(cout, "02. Histogram 2 construction (RSC) ", 1, &timing_map);
}
{
rsc_sparse_vector_u32::statistics st;
hist2.calc_stat(&st);
cout << "Histogram 2 (RSC)" << endl;
cout << " size: " << hist2.size() << endl;
cout << " NOT NULL count: " << hist2.get_null_bvector()->count() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1(cout, "06. Serialize and save histogram 2(RSC)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist2, "hist2.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
{
bm::chrono_taker tt1(cout, "03. Histogram 3 (adaptive) construction (RSC) ", 1, &timing_map);
compute_adaptive_rsc_histogram(hist3, rsc_test, hist1_avg);
}
{
rsc_sparse_vector_u32::statistics st;
hist3.calc_stat(&st);
cout << "Histogram 3 adaptive (RSC)" << endl;
cout << " size: " << hist3.size() << endl;
cout << " NOT NULL count: " << hist3.get_null_bvector()->count() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1(cout, "07. Serialize and save the adaptive histogram 3 (RSC)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist3, "hist3.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
cout << endl;
cout << "Access sample size = " << svec.size() << endl;
{
bm::chrono_taker tt1(cout, "04. Verification", 1, &timing_map);
}
unsigned long long sum1(0), sum2(0);
{
bm::chrono_taker tt1(cout, "08. Random access test H1 (SV)", 1, &timing_map);
sum1 = access_bench1(hist1, svec, sampling_interval);
}
{
bm::chrono_taker tt1(cout, "09. Random access test H2 (RSC)", 1, &timing_map);
sum2 = access_bench2(hist2, svec);
}
if (sum1 != sum2) // paranoiya check
{
cerr << "Control sum discrepancy!" << endl;
return 1;
}
{
bm::chrono_taker tt1(cout, "10. Random access test H3 adaptive (RSC)", 1, &timing_map);
access_bench3(hist3, svec, rsc_test);
}
cout << endl;
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bool find(size_type &pos) const BMNOEXCEPT
Finds index of first 1 bit.
Definition: bm.h:4855
size_type count() const BMNOEXCEPT
population count (count of ON bits)
Definition: bm.h:2366
bvector_size_type size_type
Definition: bm.h:121
bool find_reverse(size_type &pos) const BMNOEXCEPT
Finds last index of 1 bit.
Definition: bm.h:4673
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void push_back(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
back_insert_iterator get_back_inserter()
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
void calc_stat(struct rsc_sparse_vector< Val, SV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL)
size_type size() const BMNOEXCEPT
return size of the vector
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
size_type size() const BMNOEXCEPT
return size of the vector
Definition: bmsparsevec.h:711
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1804
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
Definition: bmsparsevec.h:559
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
Definition: bmsparsevec.h:572
void calc_stat(struct sparse_vector< Val, BV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
Definition: bmsparsevec.h:2078
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
Definition: bmsparsevec.h:2256
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmsparsevec.h:2094
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:229
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
Definition: bmalgo.h:394
bm::bvector ::size_type bv_size_type
Definition: sample24.cpp:54
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Definition: sample24.cpp:55
const unsigned test_size
Definition: xsample09.cpp:63
bvector_type::size_type bv_size_type
Definition: xsample09.cpp:67
static void verify_histograms(const rsc_sparse_vector_u32 &hist_rsc, const sparse_vector_u32 &hist_sv, sparse_vector_u32::size_type sampling_size)
Some test to confirm correctness.
Definition: xsample09.cpp:246
static void compute_adaptive_rsc_histogram(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Adaptive histogram identifies number of not NULL elements (events) and varies the size of the histogr...
Definition: xsample09.cpp:220
bm::bvector bvector_type
Definition: xsample09.cpp:66
const unsigned sampling_interval
Definition: xsample09.cpp:64
static void compute_rsc_historgam(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a RSC vector using fixed sampling interval.
Definition: xsample09.cpp:180
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
Definition: xsample09.cpp:71
bm::sparse_vector< unsigned, bvector_type > sparse_vector_u32
Definition: xsample09.cpp:68
int main(void)
Definition: xsample09.cpp:357
bm::chrono_taker ::duration_map_type timing_map
Definition: xsample09.cpp:76
static void compute_historgam(sparse_vector_u32 &hist_sv, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a SV vector using fixed sampling interval.
Definition: xsample09.cpp:143
static void access_bench3(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec, const rsc_sparse_vector_u32 &rsc_data)
Definition: xsample09.cpp:319
static unsigned long long access_bench1(const sparse_vector_u32 &hist_sv, const std::vector< bvector_type::size_type > &sample_vec, unsigned sampling_size)
Access benchmark 1.
Definition: xsample09.cpp:273
static void generate_test_data(rsc_sparse_vector_u32 &csv, unsigned size)
Generate a test RSC vector with a randomly distributed values imitating distribution density of genom...
Definition: xsample09.cpp:83
static void generate_access_samples(std::vector< bvector_type::size_type > &sample_vec, unsigned size)
generate list of random indexes (locations) to read histogram values
Definition: xsample09.cpp:124
static unsigned long long access_bench2(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec)
Access benchmark 2.
Definition: xsample09.cpp:294
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: xsample09.cpp:69