Succinct container scanner search using pipeline to run thousands of searches faster one by one scans. scanner::pipeline uses variuous cache and algorithmic optimization techniques to run bulk searches faster.
#include <iostream>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <thread>
#include "bmdbg.h"
using namespace std;
static
unsigned max_coll = 8000000,
unsigned repeat_factor=10)
{
string str;
for (unsigned i = 10; i < max_coll; i+= (rand()&0xF))
{
switch (i & 0xF)
{
case 0: str = "AB"; break;
case 1: str = "GTx"; break;
case 2: str = "cnv"; break;
default: str = "AbY11"; break;
}
str.append(to_string(i));
for (unsigned k = 0; k < repeat_factor; ++k)
{
str_coll.emplace_back(str);
bi = str;
}
}
bi.flush();
}
static
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if (arg == "-nodiag")
{
continue;
}
}
}
int main(
int argc,
char *argv[])
{
try
{
std::vector<string> str_coll;
cout << "Generating the test data... " << flush;
cout << "OK" << endl;
{
cout << "Remapping the data to create compressed vector " << flush;
str_sv0.remap();
str_sv0.optimize(tb);
cout << "OK" << endl;
}
{
cout << "\nStatistics on generated SV:" << endl;
bm::print_svector_stat(cout, str_sv1);
cout << "\nStatistics on remapped/optimized SV:" << endl;
bm::print_svector_stat(cout, str_sv0);
cout << endl << endl;
}
unsigned test_runs = 10000;
std::vector<string> str_test_coll;
{
if (idx >= test_runs)
idx = test_runs/2;
str_test_coll.push_back(str_coll[idx]);
}
assert(str_test_coll.size() == test_runs);
std::vector<unique_ptr<bvector_type> > res_vec1;
cout << "Running benchmark tests.." << endl;
for (int pass = 0; pass < 2; pass++)
{
cout << "PASS = " << pass << ((pass==0) ? " -- remap/optimized" : " -- NOT remapped") << endl;
res_vec1.resize(0);
const str_sv_type* str_sv = (pass==0) ? &str_sv0 : &str_sv1;
{
{
const string& str = str_test_coll[i];
res_vec1.emplace_back(unique_ptr<bvector_type>(bv_res.release()));
}
}
pipe1.options().batch_size = test_runs;
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe1.add(str.c_str());
}
pipe1.complete();
}
pipe1_and.set_search_mask(&bv_mask);
pipe1_and.options().batch_size = test_runs;
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe1_and.add(str.c_str());
}
pipe1_and.complete();
}
pipe1.options().batch_size = test_runs;
{
bm::chrono_taker tt(cout,
"scanner::pipeline find_eq_str()-count()", test_runs);
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe2.add(str.c_str());
}
pipe2.complete();
}
{
auto& res_vect = pipe1.get_bv_res_vector();
auto& res_vect_and = pipe1_and.get_bv_res_vector();
auto& cnt_vect = pipe2.get_bv_count_vector();
assert(res_vect.size() == cnt_vect.size());
size_t res_sz = res_vect.size();
for (size_t i = 0; i < res_sz; ++i)
{
assert(bv1);
assert(bv);
bool match = bv1->
equal(*bv);
assert(match);
auto c = cnt_vect[i];
(void)cnt; (void)c;
assert(cnt == c);
bv_or_total |= *bv;
{
if (bv_and)
{
auto c1 = bv_and->
count();
assert(c1 == c_and); (void)c1; (void)c_and;
match = bv_and->
equal(bv_m);
assert(match);
}
else
{
assert(!c_and);
}
}
}
}
pipe1.options().batch_size = test_runs;
pipe3.set_or_target(&bv_or);
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe3.add(str.c_str());
}
pipe3.complete();
}
bool match = bv_or.
equal(bv_or_total);
if (!match)
{
cerr << "OR vector mismatch!" << endl;
exit(1);
}
cout << endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
bool equal(const bvector< Alloc > &bvect) const BMNOEXCEPT
Equal comparison with an agr bit-vector.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_and(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand AND : this := bv1 AND bv2
bvector_size_type size_type
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
Utility class to collect performance measurements and statistics.
Pipeline to run multiple searches against a particular SV faster using cache optimizations.
algorithms for sparse_vector scan/search
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
size_type size() const
return size of the vector
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
@ use_null
support "non-assigned" or "NULL" logic
BV::size_type count_and(const BV &bv1, const BV &bv2) BMNOEXCEPT
Computes bitcount of AND operation of two bitsets.
int main(int argc, char *argv[])
static void parse_args(int argc, char *argv[])
Rudimentary cmd-args parser.
static void GenerateTestData(std::vector< string > &str_coll, str_sv_type &str_sv, unsigned max_coll=8000000, unsigned repeat_factor=10)
Test data generation.
bm::str_sparse_vector< char, bvector_type, 8 > str_sv_type
bool is_diag
Flag to print the SV diagnostics.
Aggregation options to control execution Default settings are to support only result bit-vector filte...