BitMagic-C++
strsvsample03.cpp

Example of how to use bm::str_sparse_vector<> - succinct container for bit-transposed string collections

See also
bm::str_sparse_vector
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
For more information please visit: http://bitmagic.io
*/
/** \example strsvsample03.cpp
Example of how to use bm::str_sparse_vector<> - succinct container for
bit-transposed string collections
\sa bm::str_sparse_vector
*/
/*! \file strsvsample03.cpp
\brief Example: str_sparse_vector<> back insert iterator example
This example loads sparse vector from an STL container uses re-mapping
to compress, serialize and save container to disk.
Example also illustrates how to check memory footprint.
*/
#include <iostream>
#include <string>
#include <vector>
#include <random>
#include <algorithm>
#include <fstream>
#include "bm.h"
#include "bmstrsparsevec.h"
#include "bmundef.h" /* clear the pre-proc defines from BM */
using namespace std;
// define the sparse vector type for 'char' type using bvector as
// a container of bits for bit-transposed planes
// 32 - is maximum string length for this container.
// Memory allocation is dynamic using sparse techniques, so this number
// just defines the max capacity.
//
// generate collection of strings from integers and shuffle it
//
static
void generate_string_set(vector<string>& str_vec)
{
const unsigned max_coll = 50000;
str_vec.resize(0);
string str;
for (unsigned i = 10; i < max_coll; i += rand() % 3)
{
str = to_string(i);
str_vec.emplace_back(str);
} // for i
// shuffle the data set
//
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(str_vec.begin(), str_vec.end(), g);
}
int main(void)
{
try
{
str_sv_type str_sv;
vector<string> str_vec;
std::sort(str_vec.begin(), str_vec.end()); // sort the input vector
// load sparse vector from an STL container
//
{
size_t vect_size = 0; // approx std::vector<string> memory usage
str_sv_type str_sv_tmp; // temp vector
{
str_sv_tmp.get_back_inserter();
for (auto str : str_vec)
{
bi = str;
// some approximate estimate of std::string element cost
//
size_t str_size = str.size() + sizeof(str);
vect_size += str_size;
}
// it is important to use flush, because back inserter is
// buffering data. Of cause it flashes automatically on
// destruction but explicit flush is somewhat better
// because of possible exception is thrown here and not from
// destructor.
//
bi.flush();
cout << "STL vector<string> approx.memory consumption:"
<< vect_size << endl;
}
// calculate memory footprint
//
str_sv_tmp.calc_stat(&st);
cout << "Used memory: " << st.memory_used << std::endl;
// final step is re-mapping, which increses chances for
// good memory compression.
// A side-effect here is that remapping makes container
// effectively read-only.
//
str_sv.remap_from(str_sv_tmp);
str_sv.optimize(tb); // optimize the vector
str_sv.calc_stat(&st);
cout << "Used memory after remap and optimization: "
<< std::endl;
}
// serialize and save
//
{
std::string fname = "test.sv";
bm::sparse_vector_serialize(str_sv, sv_lay, tb);
std::ofstream fout(fname.c_str(), std::ios::binary);
if (!fout.good())
{
return -1;
}
const char* buf = (char*)sv_lay.buf();
fout.write(buf, (unsigned)sv_lay.size());
if (!fout.good())
{
return -1;
}
fout.close();
cout << "Saved size: " << sv_lay.size() << endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}