72 <<
"BitMagic Dictionary Search Sample (c) 2018" << std::endl
73 <<
"-idict file-name -- input set file to parse" << std::endl
74 <<
"-svout spase vector output -- sparse vector name to save" << std::endl
75 <<
"-svin sparse vector input -- sparse vector file name to load " << std::endl
76 <<
"-remap -- re-mapping of string characters " << std::endl
77 <<
"-xor -- use XOR compression filtering" << std::endl
78 <<
"-diag -- run diagnostics" << std::endl
79 <<
"-bench -- run benchmarks" << std::endl
80 <<
"-timing -- collect timings" << std::endl
100 for (
int i = 1; i < argc; ++i)
102 std::string arg = argv[i];
103 if ((arg ==
"-h") || (arg ==
"--help"))
109 if (arg ==
"-svout" || arg ==
"--svout")
117 std::cerr <<
"Error: -svout requires file name" << std::endl;
123 if (arg ==
"-svin" || arg ==
"--svin")
131 std::cerr <<
"Error: -svin requires file name" << std::endl;
137 if (arg ==
"-idict" || arg ==
"--idict" )
145 std::cerr <<
"Error: -idict requires file name" << std::endl;
151 if (arg ==
"-remap" || arg ==
"--remap" || arg ==
"-r" || arg ==
"--r")
156 if (arg ==
"-xor" || arg ==
"--xor" || arg ==
"-x" || arg ==
"--x")
161 if (arg ==
"-diag" || arg ==
"--diag" || arg ==
"-d" || arg ==
"--d")
166 if (arg ==
"-timing" || arg ==
"--timing" || arg ==
"-t" || arg ==
"--t")
171 if (arg ==
"-bench" || arg ==
"--bench" || arg ==
"-b" || arg ==
"--b")
177 std::cerr <<
"Error: unknown argument: " << arg << std::endl;
202 std::ifstream fin(fname.c_str(), std::ios::in);
207 std::regex reg(
"[|]");
208 std::sregex_token_iterator it_end;
210 string trim_chars(
"\" ");
212 for (
unsigned i = 0; std::getline(fin, line); ++i)
214 if (line.empty() || !isdigit(line.front()))
218 std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
219 std::vector<std::string> line_vec(it, it_end);
220 if (line_vec.empty())
225 string& col13 = line_vec.at(13);
226 col13.erase(0, col13.find_first_not_of(trim_chars));
227 col13.erase(col13.find_last_not_of(trim_chars) + 1);
230 str_vec.emplace_back(col13);
232 catch (std::exception&)
238 cout <<
"\rReading input file: " << i << flush;
250 if (str_vec.size() != str_sv.
size())
251 throw runtime_error(
"Error. size() comparison failed!");
256 const string& s_control = str_vec[i];
259 cout <<
"idx=" << i << s <<
"!=" << s_control << endl;
260 throw runtime_error(
"Error. element comparison failed!");
263 std::cout <<
"Check ok. Dictionary size = " << str_sv.
size() << std:: endl;
274 std::uniform_int_distribution<unsigned>
rand_dis(0,
unsigned(str_vec.size()-1));
287 if (idx < str_vec.size())
288 bench_vec.push_back(str_vec[idx]);
295 string str_nf = str_vec[idx];
296 string::reverse_iterator rit = str_nf.rbegin();
297 string::reverse_iterator rit_end = str_nf.rend();
298 for (; rit != rit_end; ++rit)
301 int a = rand() % 26 + int(
'A');
304 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
305 if (it == str_vec.end() || *it != str_nf)
307 bench_vec_not_found.push_back(str_nf);
327 cout <<
"Picked " << bench_vec.
size() <<
" / "
328 << bench_vec_not_found.size() <<
" samples. Running benchmarks."
331 unsigned bench_size = unsigned(bench_vec.size());
335 for (
const string& term : bench_vec)
337 auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
338 if (it != str_vec.end())
340 string_vector::size_type idx =
341 string_vector::size_type(std::distance(str_vec.begin(), it));
342 bv1.
set(
unsigned(idx));
348 for (
const string& term : bench_vec_not_found)
350 auto p = std::lower_bound(str_vec.begin(), str_vec.end(), term);
357 std::map<string, unsigned> str_map;
358 for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
360 const string& s = str_vec[i];
361 str_map[s] = unsigned(i);
365 for (
const string& term : bench_vec)
367 auto it = str_map.find(term);
368 if (it != str_map.end())
370 bv2.
set(
unsigned(it->second));
376 for (
const string& term : bench_vec_not_found)
378 auto it = str_map.find(term);
379 if (it != str_map.end())
381 cerr <<
"empty search returned value..." << endl;
391 for (
const string& term : bench_vec)
394 bool found = scanner.
find_eq_str(str_sv, term.c_str(), pos);
403 for (
const string& term : bench_vec_not_found)
406 bool found = scanner.
find_eq_str(str_sv, term.c_str(), pos);
409 cerr <<
"scanner empty search returned value..." << endl;
417 scanner.
bind(str_sv,
true);
421 for (
const string& term : bench_vec)
433 for (
const string& term : bench_vec_not_found)
439 cerr <<
"scanner empty search returned value..." << endl;
449 throw runtime_error(
"Error. RB-search mismatch!");
452 throw runtime_error(
"Error. scanner mismatch!");
456 throw runtime_error(
"Error. binary scanner mismatch!");
458 if (bv1.
count() != bench_size)
459 throw runtime_error(
"Error. Search result missing elements!");
465int main(
int argc,
char *argv[])
490 cout <<
"Loaded " << str_vec.size() <<
" dictionary names." << endl;
492 std::sort(str_vec.begin(), str_vec.end());
502 for (
const string& term : str_vec)
513 str_sv.
swap(str_sv_remap);
528 cout <<
"Input vector empty!" << endl;
537 str_vec.emplace_back(std::move(s));
550 bool eq = str_sv.
equal(str_sv_control);
553 cerr <<
"Serialization control failed" << endl;
564 print_svector_stat(cout,str_sv,
false);
569 size_t total_size = 0;
570 for (
const string& term : str_vec)
572 total_size += term.size();
574 cout <<
"String dictionary size: "
575 << total_size / 1024 <<
"KB (" << total_size / (1024*1024) <<
"MB)"
579 if (str_sv.
size() && str_vec.size())
581 cout <<
"Run full comparison check..." << endl;
583 cout <<
"Ok" << endl;
594 std::cout << std::endl <<
"Performance:" << std::endl;
598 catch (std::exception& ex)
600 std::cerr <<
"Error:" << ex.what() << std::endl;
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Algorithms for bm::sparse_vector.
Serialization for sparse_vector<>
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector: bit-vector container with runtime compression of bits.
bool test(size_type n) const BMNOEXCEPT
Returns true if bit n is set and false if bit n is 0.
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store)
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
void resize(size_type new_size)
Change size of the bvector.
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Utility class to collect performance measurements and statistics.
std::map< std::string, statistics > duration_map_type
test name to duration map
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
algorithms for sparse_vector scan/search
bool bfind_eq_str(const SV &sv, const value_type *str, size_type &pos)
Binary find of the first sparse vector element (string). The sparse vector must be sorted.
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
void flush()
flush the accumulated buffer.
Succinct sparse vector for strings with compression using the bit-slicing (transposition) method
void swap(str_sparse_vector &str_sv) BMNOEXCEPT
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
bool empty() const
return true if vector is empty
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector. Remapped vector likely saves memo...
size_type size() const
return size of the vector
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
back_insert_iterator get_back_inserter()
Provide back insert iterator. Back insert iterator implements buffered insertion, which is faster,...
bvector_type::size_type size_type
std::random_device rand_dev
std::uniform_int_distribution rand_dis(1, int(vector_max))
std::mt19937 gen(rand_dev())
int main(int argc, char *argv[])
static int parse_args(int argc, char *argv[])
static void show_help()
Print help.
vector< string > string_vector
bm::chrono_taker ::duration_map_type timing_map
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correctness.
const unsigned benchmark_max
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect