BitMagic-C++
strsvsample03.cpp

Example of how to use bm::str_sparse_vector<> - succinct container for bit-transposed string collections

See also
bm::str_sparse_vector
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
For more information please visit: http://bitmagic.io
*/
/** \example strsvsample03.cpp
Example of how to use bm::str_sparse_vector<> - succinct container for
bit-transposed string collections
\sa bm::str_sparse_vector
*/
/*! \file strsvsample03.cpp
\brief Example: str_sparse_vector<> back insert iterator example
This example loads a sparse vector from an STL container, uses re-mapping
to compress it, then serializes and saves the container to disk.
Example also illustrates how to check memory footprint.
*/
#include <iostream>
#include <string>
#include <vector>
#include <random>
#include <algorithm>
#include <fstream>
#include "bm.h"
#include "bmstrsparsevec.h"
#include "bmsparsevec_serial.h"
using namespace std;
// define the sparse vector type for 'char' type using bvector as
// a container of bits for bit-transposed planes
// 32 - is maximum string length for this container.
// Memory allocation is dynamic using sparse techniques, so this number
// just defines the max capacity.
//
// generate collection of strings from integers and shuffle it
//
/// Fill str_vec with decimal strings built from an increasing integer
/// sequence, then shuffle the result.
///
/// Previous content of str_vec is discarded. The sequence starts at 10,
/// stays below 50000 and advances by rand() % 3 after every element, so a
/// zero step emits the same value again.
static
void generate_string_set(vector<string>& str_vec)
{
    const unsigned upper_bound = 50000;

    str_vec.clear();

    unsigned value = 10;
    while (value < upper_bound)
    {
        str_vec.push_back(std::to_string(value));
        value += rand() % 3;
    }

    // randomize the element order
    //
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::shuffle(str_vec.begin(), str_vec.end(), engine);
}
int main(void)
{
try
{
str_sv_type str_sv;
vector<string> str_vec;
std::sort(str_vec.begin(), str_vec.end()); // sort the input vector
// load sparse vector from an STL container
//
{
size_t vect_size = 0; // approx std::vector<string> memory usage
str_sv_type str_sv_tmp; // temp vector
{
str_sv_tmp.get_back_inserter();
for (auto str : str_vec)
{
bi = str;
// some approximate estimate of std::string element cost
//
size_t str_size = str.size() + sizeof(str);
vect_size += str_size;
}
// it is important to use flush, because back inserter is
// buffering data. Of cause it flashes automatically on
// destruction but explicit flush is somewhat better
// because of possible exception is thrown here and not from
// destructor.
//
bi.flush();
cout << "STL vector<string> approx.memory consumption:"
<< vect_size << endl;
}
// calculate memory footprint
//
str_sv_tmp.calc_stat(&st);
cout << "Used memory: " << st.memory_used << std::endl;
// final step is re-mapping, which increses chances for
// good memory compression.
// A side-effect here is that remapping makes container
// effectively read-only.
//
str_sv.remap_from(str_sv_tmp);
str_sv.optimize(tb); // optimize the vector
str_sv.calc_stat(&st);
cout << "Used memory after remap and optimization: "
<< std::endl;
}
// serialize and save
//
{
std::string fname = "test.sv";
bm::sparse_vector_serialize(str_sv, sv_lay, tb);
std::ofstream fout(fname.c_str(), std::ios::binary);
if (!fout.good())
{
return -1;
}
const char* buf = (char*)sv_lay.buf();
fout.write(buf, (unsigned)sv_lay.size());
if (!fout.good())
{
return -1;
}
fout.close();
cout << "Saved size: " << sv_lay.size() << endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
bm::str_sparse_vector::back_insert_iterator::flush
void flush()
flush the accumulated buffer
Definition: bmstrsparsevec.h:1895
BM_DECLARE_TEMP_BLOCK
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
bm::sparse_vector_serial_layout::buf
const unsigned char * buf() const
Return serialization buffer pointer.
Definition: bmsparsevec_serial.h:106
main
int main(void)
Definition: strsvsample03.cpp:82
bm::str_sparse_vector::calc_stat
void calc_stat(struct str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
Definition: bmstrsparsevec.h:1338
bmsparsevec_serial.h
Serialization for sparse_vector<>
bm::sparse_vector_serialize
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
Definition: bmsparsevec_serial.h:356
bm::sparse_vector_serial_layout
layout class for serialization buffer structure
Definition: bmsparsevec_serial.h:57
bm::bvector<>
generate_string_set
static void generate_string_set(vector< string > &str_vec)
Definition: strsvsample03.cpp:62
bm::str_sparse_vector
sparse vector for strings with compression using bit transposition method
Definition: bmstrsparsevec.h:56
bmstrsparsevec.h
string sparse vector based on bit-transposed matrix
bm::str_sparse_vector::get_back_inserter
back_insert_iterator get_back_inserter()
Provide back insert iterator. Back insert iterator implements buffered insertion, which is faster,...
Definition: bmstrsparsevec.h:721
str_sv_type
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
Definition: strsvsample03.cpp:56
bvector_type
bm::bvector bvector_type
Definition: strsvsample03.cpp:48
bm::sparse_vector_serial_layout::size
size_t size() const
return current serialized size
Definition: bmsparsevec_serial.h:84
bm::str_sparse_vector::back_insert_iterator
Back insert iterator implements buffered insert, faster than generic access assignment.
Definition: bmstrsparsevec.h:264
bm::bv_statistics::memory_used
size_t memory_used
memory usage for all blocks and service tables
Definition: bmfunc.h:61
bm::str_sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmstrsparsevec.h:1323
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
bm::str_sparse_vector::remap_from
void remap_from(const str_sparse_vector &str_sv)
Build remapping profile and load content from another sparse vector.
Definition: bmstrsparsevec.h:1600
bm::str_sparse_vector::statistics
Definition: bmstrsparsevec.h:72