Then it computes variants of density histograms of using presense/absense (NOT NULL bit-vector) and bm::bvector<>::count_range() function.
#include <iostream>
#include <memory>
#include <vector>
#include <random>
#include <algorithm>
#include <stdexcept>
using namespace std;
#include "bmdbg.h"
static
{
unsigned cnt = 0;
{
for (unsigned i = 0; i < size/3; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
}
bit.flush();
}
{
bit.add_null(size/3);
for (unsigned i = 0; i < size; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
}
bit.flush();
}
}
static
unsigned size)
{
for (unsigned i = 0; i < size; )
{
unsigned sample = (unsigned)(rand() % 256);
sample_vec.push_back(sample);
i += sample;
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(sample_vec.begin(), sample_vec.end(), g);
}
static
{
assert(sampling_size);
assert(sampling_size < csv.
size());
assert(bv_null);
{
do
{
auto cnt = bv_null->count_range(from, to);
bit = cnt;
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
bit.flush();
}
}
static
{
assert(sampling_size);
assert(sampling_size < csv.
size());
{
assert(bv_null);
do
{
auto cnt = bv_null->count_range(from, to);
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
}
}
static
{
assert(sampling_size);
size_t sz = pair_vect.size();
for (size_t k = 0; k < sz; ++k)
{
const auto& p = pair_vect[k];
}
}
static
{
auto en = bv_null->get_enumerator(0);
for (;en.valid(); ++en)
{
auto idx = *en;
auto v1 = hist_rsc[idx];
auto v2 = hist_sv[sv_idx];
if (v1 != v2)
{
cerr << "Discrepancy at:" << idx << endl;
exit(1);
}
}
}
static
const std::vector<bvector_type::size_type>& sample_vec,
unsigned sampling_size)
{
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
idx = idx / sampling_size;
auto v = hist_sv[idx];
sum += v;
}
return sum;
}
static
const std::vector<bvector_type::size_type>& sample_vec)
{
assert(bv_null);
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
assert(found); (void)found;
auto v = hist_rsc[pos];
sum += v;
}
return sum;
}
static
const std::vector<bvector_type::size_type>& sample_vec,
{
{
assert(found); (void)found;
}
assert(bv_null);
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
assert(found);
found = bv_null->
find(idx+1, pos_end);
if (!found)
pos_end = last;
}
}
{
try
{
unsigned hist1_avg = 0;
std::vector<bvector_type::size_type> svec;
{
assert(bv_null);
data_set_size = bv_null->
count();
cout <<
"Data set size: " << rsc_test.
size() << endl;
cout << "Number of elements/events in the data set: " << data_set_size << endl;
}
{
}
{
sparse_vector_u32::statistics st;
cout << "Histogram 1 (SV)" << endl;
cout <<
" size: " << hist1.
size() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
size_t serialized_size = 0;
int res = bm::file_save_svector(hist1, "hist1.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
unsigned h1_sum = 0;
auto it_end = hist1.
end();
for (; it != it_end; ++it)
h1_sum += *it;
assert (h1_sum == data_set_size);
hist1_avg = h1_sum / hist1.
size();
}
{
}
{
rsc_sparse_vector_u32::statistics st;
cout << "Histogram 2 (RSC)" << endl;
cout <<
" size: " << hist2.
size() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
size_t serialized_size = 0;
int res = bm::file_save_svector(hist2, "hist2.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
{
}
{
rsc_sparse_vector_u32::statistics st;
cout << "Histogram 3 adaptive (RSC)" << endl;
cout <<
" size: " << hist3.
size() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
size_t serialized_size = 0;
int res = bm::file_save_svector(hist3, "hist3.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
cout << endl;
cout << "Access sample size = " << svec.size() << endl;
{
}
unsigned long long sum1(0), sum2(0);
{
}
{
}
if (sum1 != sum2)
{
cerr << "Control sum discrepancy!" << endl;
return 1;
}
{
}
cout << endl;
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Compressed sparse container rsc_sparse_vector<> for integer types.
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
bool find(size_type &pos) const BMNOEXCEPT
Finds index of first 1 bit.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bvector_size_type size_type
bool find_reverse(size_type &pos) const BMNOEXCEPT
Finds last index of 1 bit.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void push_back(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
back_insert_iterator get_back_inserter()
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
void calc_stat(struct rsc_sparse_vector< Val, SV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values (or NULL)
size_type size() const BMNOEXCEPT
return size of the vector
SV::bvector_type bvector_type
succinct sparse vector with runtime compression using bit-slicing / transposition method
bvector_type::size_type size_type
size_type size() const BMNOEXCEPT
return size of the vector
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
void calc_stat(struct sparse_vector< Val, BV >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
@ use_null
support "non-assigned" or "NULL" logic
void rank_range_split(const BV &bv, typename BV::size_type rank, PairVect &target_v)
Algorithm to identify bit-vector ranges (splits) for the rank.
bm::bvector ::size_type bv_size_type
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
bvector_type::size_type bv_size_type
static void verify_histograms(const rsc_sparse_vector_u32 &hist_rsc, const sparse_vector_u32 &hist_sv, sparse_vector_u32::size_type sampling_size)
Some test to confirm correctness.
static void compute_adaptive_rsc_histogram(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Adaptive histogram identifies number of not NULL elements (events) and varies the size of the histogr...
const unsigned sampling_interval
static void compute_rsc_historgam(rsc_sparse_vector_u32 &hist_rsc, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a RSC vector using fixed sampling interval.
std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector
bm::sparse_vector< unsigned, bvector_type > sparse_vector_u32
bm::chrono_taker ::duration_map_type timing_map
static void compute_historgam(sparse_vector_u32 &hist_sv, const rsc_sparse_vector_u32 &csv, sparse_vector_u32::size_type sampling_size)
Compute histogram as a SV vector using fixed sampling interval.
static void access_bench3(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec, const rsc_sparse_vector_u32 &rsc_data)
static unsigned long long access_bench1(const sparse_vector_u32 &hist_sv, const std::vector< bvector_type::size_type > &sample_vec, unsigned sampling_size)
Access benchmark 1.
static void generate_test_data(rsc_sparse_vector_u32 &csv, unsigned size)
Generate a test RSC vector with a randomly distributed values imitating distribution density of genom...
static void generate_access_samples(std::vector< bvector_type::size_type > &sample_vec, unsigned size)
generate list of random indexes (locations) to read histogram values
static unsigned long long access_bench2(const rsc_sparse_vector_u32 &hist_rsc, const std::vector< bvector_type::size_type > &sample_vec)
Access benchmark 2.
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32