BitMagic-C++
xsample05.cpp
/*
Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
For more information please visit: http://bitmagic.io
*/
/** \example xsample05.cpp
*/
/*! \file xsample05.cpp
\brief Example: Example on how to use bit-transposed string sparse vector
Illustrates how to build a sparse vector, serialize it to disk,
load back and do search or binary hybrid search.
\sa bm::str_sparse_vector<>
*/
#include <iostream>
#include <sstream>
#include <chrono>
#include <regex>
#include <time.h>
#include <stdio.h>
#include <vector>
#include <map>
#include <chrono>
#include <map>
#include <utility>
#include <algorithm>
#include <random>
using namespace std;
//#define BMAVX2OPT
#include "bm.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmrandom.h"
#include "bmstrsparsevec.h"
#include "bmdbg.h"
#include "bmtimer.h"
/// Print help
static
void show_help()
{
std::cerr
<< "BitMagic Dictionary Search Sample (c) 2018" << std::endl
<< "-idict file-name -- input set file to parse" << std::endl
<< "-svout spase vector output -- sparse vector name to save" << std::endl
<< "-svin sparse vector input -- sparse vector file name to load " << std::endl
<< "-diag -- run diagnostics" << std::endl
<< "-bench -- run benchmarks" << std::endl
<< "-timing -- collect timings" << std::endl
;
}
// Arguments
//
std::string sv_out_name;
std::string sv_in_name;
std::string i_name;
bool is_diag = false;
bool is_timing = false;
bool is_bench = false;
// Parse command line arguments
static
int parse_args(int argc, char *argv[])
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help"))
{
return 0;
}
if (arg == "-svout" || arg == "--svout")
{
if (i + 1 < argc)
{
sv_out_name = argv[++i];
}
else
{
std::cerr << "Error: -svout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-svin" || arg == "--svin")
{
if (i + 1 < argc)
{
sv_in_name = argv[++i];
}
else
{
std::cerr << "Error: -svin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-idict" || arg == "--idict" )
{
if (i + 1 < argc)
{
i_name = argv[++i];
}
else
{
std::cerr << "Error: -idict requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
{
is_diag = true;
continue;
}
if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
{
is_timing = true;
continue;
}
if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
{
is_bench = true;
continue;
}
std::cerr << "Error: unknown argument: " << arg << std::endl;
return 1;
} // for i
return 0;
}
// Global types
//
typedef vector<string> string_vector;
/// Parse the input file and extract dictionary values.
///
static
int load_dict_report(const std::string& fname, string_vector& str_vec)
{
bm::chrono_taker tt1("1. parse input data ", 1, &timing_map);
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
std::string line;
std::regex reg("[|]");
std::sregex_token_iterator it_end;
string trim_chars("\" ");
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty() || !isdigit(line.front()))
continue;
// regex based tokenizer
std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
std::vector<std::string> line_vec(it, it_end);
if (line_vec.empty())
continue;
try
{
// trip the extra chars
string& col13 = line_vec.at(13);
col13.erase(0, col13.find_first_not_of(trim_chars));
col13.erase(col13.find_last_not_of(trim_chars) + 1);
if (!col13.empty())
str_vec.emplace_back(col13);
}
catch (std::exception&)
{
// just ignore (not ideal, but ok for a sketch ETL)
}
if (i % 10000 == 0)
{
cout << "\rReading input file: " << i << flush;
}
} // for
cout << endl;
return 0;
}
/// Compare STL vector with bit-transposed container to check correcness
///
static
void check_sparse(const str_sparse_vect& str_sv, const string_vector& str_vec)
{
if (str_vec.size() != str_sv.size())
throw runtime_error("Error. size() comparison failed!");
string s;
for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
{
str_sv.get(i, s);
const string& s_control = str_vec[i];
if (s != s_control)
throw runtime_error("Error. element comparison failed!");
} // for
std::cout << "Check ok. Dictionary size = " << str_sv.size() << std:: endl;
}
const unsigned benchmark_max = 15000; // benchmark sampling size
/// Sample a few random terms out of collection
static
void pick_benchmark_set(string_vector& bench_vec, string_vector& bench_vec_not_found, const string_vector& str_vec)
{
std::random_device rand_dev;
std::mt19937 gen(rand_dev()); // mersenne_twister_engine
std::uniform_int_distribution<unsigned> rand_dis(0, unsigned(str_vec.size()-1)); // generate uniform numebrs for [1, vector_max]
bench_vec.resize(0);
for (unsigned i = 0; i < benchmark_max; ++i)
{
unsigned idx;
while (true)
{
idx = unsigned(rand_dis(gen));
if (bv.test(idx)) // make sure benchmark example is not repeated
continue;
if (idx < str_vec.size())
bench_vec.push_back(str_vec[idx]);
break;
}
bv.set(idx); // mark as set
// generate not-found case by modifying a letter in an existing sample
{
string str_nf = str_vec[idx];
string::reverse_iterator rit = str_nf.rbegin();
string::reverse_iterator rit_end = str_nf.rend();
for (; rit != rit_end; ++rit)
{
char ch = *rit;
int a = rand() % 26 + int('A'); // generate random letter
ch = char(a);
*rit = ch;
auto it = std::lower_bound(str_vec.begin(), str_vec.end(), str_nf);
if (it == str_vec.end() || *it != str_nf) // check if not found
{
bench_vec_not_found.push_back(str_nf);
break;
}
} // for rit
}
} // for
cout << endl;
}
static
void run_benchmark(const str_sparse_vect& str_sv, const string_vector& str_vec)
{
string_vector bench_vec;
string_vector bench_vec_not_found; // vector for impossible dictionary items
pick_benchmark_set(bench_vec, bench_vec_not_found, str_vec);
bm::bvector<> bv1, bv2, bv3, bv4;
cout << "Picked " << bench_vec.size() << " / "
<< bench_vec_not_found.size() << " samples. Running benchmarks."
<< endl;
unsigned bench_size = unsigned(bench_vec.size());
{
{
bm::chrono_taker tt1("3. std::lower_bound() search", bench_size, &timing_map);
for (const string& term : bench_vec)
{
auto it = std::lower_bound(str_vec.begin(), str_vec.end(), term);
if (it != str_vec.end())
{
string_vector::size_type idx =
string_vector::size_type(std::distance(str_vec.begin(), it));
bv1.set(unsigned(idx));
}
} // for
}
{
bm::chrono_taker tt2("3a. std::lower_bound() search (empty)", bench_size, &timing_map);
for (const string& term : bench_vec_not_found)
{
std::lower_bound(str_vec.begin(), str_vec.end(), term);
} // for
}
}
{
// construct std::map<> (RB-tree)
std::map<string, unsigned> str_map;
for (string_vector::size_type i = 0; i < str_vec.size(); ++i)
{
const string& s = str_vec[i];
str_map[s] = unsigned(i);
} // for
{
bm::chrono_taker tt1("4. std::map<> search", bench_size, &timing_map);
for (const string& term : bench_vec)
{
auto it = str_map.find(term);
if (it != str_map.end())
{
bv2.set(unsigned(it->second));
}
} // for
}
{
bm::chrono_taker tt2("4a. std::map<> search (empty)", bench_size, &timing_map);
for (const string& term : bench_vec_not_found)
{
auto it = str_map.find(term);
if (it != str_map.end())
{
cerr << "empty search returned value..." << endl;
}
} // for
}
}
{
{
bm::chrono_taker tt1("5. bm::sparse_vector_scanner<> search", bench_size, &timing_map);
for (const string& term : bench_vec)
{
unsigned pos;
bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
if (found)
{
bv3.set(pos);
}
} // for
}
{
bm::chrono_taker tt1("5a. bm::sparse_vector_scanner<> search (empty)", bench_size, &timing_map);
for (const string& term : bench_vec_not_found)
{
unsigned pos;
bool found = scanner.find_eq_str(str_sv, term.c_str(), pos);
if (found)
{
cerr << "scanner empty search returned value..." << endl;
}
} // for
}
}
{
scanner.bind(str_sv, true); // attach SV as permanent search parameter to share cached values
{
bm::chrono_taker tt1("6. bm::sparse_vector_scanner<> binary search", bench_size, &timing_map);
for (const string& term : bench_vec)
{
unsigned pos;
bool found = scanner.bfind_eq_str(term.c_str(), pos);
if (found)
{
bv4.set(pos);
}
} // for
}
{
bm::chrono_taker tt2("6a. bm::sparse_vector_scanner<> binary search (empty)", bench_size, &timing_map);
for (const string& term : bench_vec_not_found)
{
unsigned pos;
bool found = scanner.bfind_eq_str(term.c_str(), pos);
if (found)
{
cerr << "scanner empty search returned value..." << endl;
}
} // for
}
}
// various integrity checks
//
int cmp = bv1.compare(bv2);
if (cmp != 0)
throw runtime_error("Error. RB-search mismatch!");
cmp = bv1.compare(bv3);
if (cmp != 0)
throw runtime_error("Error. scanner mismatch!");
cmp = bv1.compare(bv4);
if (cmp != 0)
throw runtime_error("Error. binary scanner mismatch!");
if (bv1.count() != bench_size)
throw runtime_error("Error. Search result missing elements!");
}
int main(int argc, char *argv[])
{
if (argc < 3)
{
return 1;
}
string_vector str_vec; // dictionary vector (STL)
str_sparse_vect str_sv; // bit-transposed sparse vector
try
{
auto ret = parse_args(argc, argv);
if (ret != 0)
{
return ret;
}
if (!i_name.empty())
{
auto res = load_dict_report(i_name, str_vec);
if (res != 0)
{
return res;
}
cout << "Loaded " << str_vec.size() << " dictionary names." << endl;
std::sort(str_vec.begin(), str_vec.end());
}
if (str_vec.size()) // load the sparse vector
{
bm::chrono_taker tt1("2. build sparse vector", 1, &timing_map);
for (const string& term : str_vec)
str_sv.push_back(term);
// build remapped (dense) sparse vector
// (this should be final), no more edits in this form
{
str_sparse_vect str_sv_remap;
str_sv_remap.remap_from(str_sv);
str_sv.swap(str_sv_remap);
}
str_sv.optimize(tb); // memory optimization after load
}
// save SV vector to file
if (!sv_out_name.empty() && !str_sv.empty())
{
bm::chrono_taker tt1("7. Save sparse vector", 1, &timing_map);
file_save_svector(str_sv, sv_out_name);
}
if (!sv_in_name.empty())
{
{
bm::chrono_taker tt1("8. Load sparse vector", 1, &timing_map);
file_load_svector(str_sv, sv_in_name);
}
if (str_vec.empty())
{
for (str_sparse_vect::size_type i = 0; i < str_sv.size(); ++i)
{
string s;
str_sv.get(i, s);
str_vec.emplace_back(std::move(s));
} // for
}
}
if (is_diag)
{
if (!str_sv.empty())
{
print_svector_stat(str_sv, true);
}
if (str_vec.size())
{
size_t total_size = 0;
for (const string& term : str_vec)
{
total_size += term.size();
}
cout << "String dictionary size: "
<< total_size / 1024 << "KB (" << total_size / (1024*1024) << "MB)"
<< endl;
}
if (str_sv.size() && str_vec.size())
{
cout << "Run full comparison check..." << endl;
check_sparse(str_sv, str_vec); // run a paranoiya check
cout << "Ok" << endl;
}
}
if (is_bench) // run set of benchmarks
{
run_benchmark(str_sv, str_vec);
}
if (is_timing) // print all collected timings
{
std::cout << std::endl << "Performance:" << std::endl;
}
}
catch (std::exception& ex)
{
std::cerr << "Error:" << ex.what() << std::endl;
return 1;
}
return 0;
}
bmsparsevec_algo.h
Algorithms for bm::sparse_vector.
bmrandom.h
Generation of random subset.
bm::chrono_taker
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
bm::bvector::size
size_type size() const BMNOEXCEPT
Returns bvector's capacity (number of bits it can store)
Definition: bm.h:1272
bm::str_sparse_vector::get
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
Definition: bmstrsparsevec.h:1295
bm::str_sparse_vector::push_back
void push_back(const StrType &str)
push back a string
Definition: bmstrsparsevec.h:532
BM_DECLARE_TEMP_BLOCK
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
sv_out_name
std::string sv_out_name
Definition: xsample05.cpp:84
bm::sparse_vector_scanner
algorithms for sparse_vector scan/search
Definition: bmsparsevec_algo.h:481
bm::bvector::compare
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition: bm.h:3173
bm::sparse_vector_scanner::bfind_eq_str
bool bfind_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
binary find first sparse vector element (string) Sparse vector must be sorted.
Definition: bmsparsevec_algo.h:1561
bmsparsevec_serial.h
Serialization for sparse_vector<>
bmalgo.h
Algorithms for bvector<> (main include)
is_timing
bool is_timing
Definition: xsample05.cpp:88
show_help
static void show_help()
Print help.
Definition: xsample05.cpp:68
is_diag
bool is_diag
Definition: xsample05.cpp:87
bm::str_sparse_vector::empty
bool empty() const
return true if vector is empty
Definition: bmstrsparsevec.h:642
bm::bvector<>
bm::bvector::set
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:3595
bm::str_sparse_vector::size
size_type size() const
return size of the vector
Definition: bmstrsparsevec.h:637
bm::str_sparse_vector
sparse vector for strings with compression using bit transposition method
Definition: bmstrsparsevec.h:56
bm::bvector::count
size_type count() const BMNOEXCEPT
population cout (count of ON bits)
Definition: bm.h:2208
str_sparse_vect
bm::str_sparse_vector< char, bm::bvector<>, 64 > str_sparse_vect
Definition: xsample05.cpp:173
bmstrsparsevec.h
string sparse vector based on bit-transposed matrix
bmsparsevec_util.h
gen
std::mt19937 gen(rand_dev())
benchmark_max
const unsigned benchmark_max
Definition: xsample05.cpp:248
timing_map
bm::chrono_taker::duration_map_type timing_map
Definition: xsample05.cpp:178
bm::chrono_taker::ct_time
@ ct_time
Definition: bmtimer.h:58
bmtimer.h
Timing utilities for benchmarking (internal)
sv_in_name
std::string sv_in_name
Definition: xsample05.cpp:85
bm::str_sparse_vector::size_type
bvector_type::size_type size_type
Definition: bmstrsparsevec.h:63
bm::str_sparse_vector::swap
void swap(str_sparse_vector &str_sv) BMNOEXCEPT
Definition: bmstrsparsevec.h:1146
parse_args
static int parse_args(int argc, char *argv[])
Definition: xsample05.cpp:93
is_bench
bool is_bench
Definition: xsample05.cpp:89
bm::sparse_vector_scanner::find_eq_str
bool find_eq_str(const SV &sv, const typename SV::value_type *str, typename SV::size_type &pos)
find first sparse vector element (string)
Definition: bmsparsevec_algo.h:1515
bm::bvector::resize
void resize(size_type new_size)
Change size of the bvector.
Definition: bm.h:2270
bm::bvector::test
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition: bm.h:1454
bm::chrono_taker::duration_map_type
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
bmserial.h
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
bm::chrono_taker::print_duration_map
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
bmalgo_similarity.h
pick_benchmark_set
static void pick_benchmark_set(string_vector &bench_vec, string_vector &bench_vec_not_found, const string_vector &str_vec)
Sample a few random terms out of collection.
Definition: xsample05.cpp:252
check_sparse
static void check_sparse(const str_sparse_vect &str_sv, const string_vector &str_vec)
Compare STL vector with bit-transposed container to check correcness.
Definition: xsample05.cpp:233
i_name
std::string i_name
Definition: xsample05.cpp:86
string_vector
vector< string > string_vector
Definition: xsample05.cpp:174
bm::sparse_vector_scanner::bind
void bind(const SV &sv, bool sorted)
bind sparse vector for all searches
Definition: bmsparsevec_algo.h:1118
main
int main(int argc, char *argv[])
Definition: xsample05.cpp:447
bm::str_sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *stat=0)
run memory optimization for all vector plains
Definition: bmstrsparsevec.h:1323
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
run_benchmark
static void run_benchmark(const str_sparse_vect &str_sv, const string_vector &str_vec)
Definition: xsample05.cpp:300
load_dict_report
static int load_dict_report(const std::string &fname, string_vector &str_vec)
Parse the input file and extract dictionary values.
Definition: xsample05.cpp:183
rand_dis
std::uniform_int_distribution rand_dis(1, int(vector_max))
bm::str_sparse_vector::remap_from
void remap_from(const str_sparse_vector &str_sv)
Build remapping profile and load content from another sparse vector.
Definition: bmstrsparsevec.h:1600
rand_dev
std::random_device rand_dev
Definition: sample11.cpp:49