BitMagic-C++
strsvsample03.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample03.cpp
20 Example of how to use bm::str_sparse_vector<> - succinct container for
21 bit-transposed string collections
22
23 \sa bm::str_sparse_vector
24*/
25
26/*! \file strsvsample03.cpp
27 \brief Example: str_sparse_vector<> back insert iterator example
28
29 This example loads sparse vector from an STL container uses character re-mapping
30 to compress, serialize and save container to disk.
31 Example also illustrates how to check memory footprint.
32*/
33
34#include <iostream>
35#include <string>
36#include <vector>
37#include <random>
38#include <algorithm>
39#include <fstream>
40
41#include "bm.h"
42#include "bmstrsparsevec.h"
43#include "bmsparsevec_serial.h"
44#include "bmundef.h" /* clear the pre-proc defines from BM */
45
46using namespace std;
47
49
50// define the sparse vector type for 'char' type using bvector as
51// a container of bits for bit-transposed planes
52// 32 - is maximum string length for this container.
53// Memory allocation is dynamic using sparse techniques, so this number
54// just defines the max capacity.
55//
57
58
59// generate collection of strings from integers and shuffle it
60//
61static
62void generate_string_set(vector<string>& str_vec)
63{
64 const unsigned max_coll = 50000;
65
66 str_vec.resize(0);
67 string str;
68 for (unsigned i = 10; i < max_coll; i += rand() % 3)
69 {
70 str = to_string(i);
71 str_vec.emplace_back(str);
72 } // for i
73
74 // shuffle the data set
75 //
76 std::random_device rd;
77 std::mt19937 g(rd());
78 std::shuffle(str_vec.begin(), str_vec.end(), g);
79}
80
81
82int main(void)
83{
84 try
85 {
86 str_sv_type str_sv;
87
88 vector<string> str_vec;
89 generate_string_set(str_vec);
90 std::sort(str_vec.begin(), str_vec.end()); // sort the input vector
91
92
93 // load sparse vector from an STL container
94 //
95 {
96 size_t vect_size = 0; // approx std::vector<string> memory usage
97 str_sv_type str_sv_tmp; // temp vector
98 {
100 str_sv_tmp.get_back_inserter();
101 for (auto str : str_vec)
102 {
103 bi = str;
104
105 // some approximate estimate of std::string element cost
106 //
107 size_t str_size = str.size() + sizeof(str);
108 vect_size += str_size;
109 }
110
111 // it is important to use flush, because back inserter is
112 // buffering data. Of cause it flashes automatically on
113 // destruction but explicit flush is somewhat better
114 // because of possible exception is thrown here and not from
115 // destructor.
116 //
117
118 bi.flush();
119
120 cout << "STL vector<string> approx.memory consumption:"
121 << vect_size << endl;
122 }
123
124 // calculate memory footprint
125 //
127 str_sv_tmp.calc_stat(&st);
128
129 cout << "Used memory: " << st.memory_used << std::endl;
130
131
132 // final step is re-mapping, which increses chances for
133 // good memory compression.
134 // A side-effect here is that remapping makes container
135 // effectively read-only.
136 //
137 str_sv.remap_from(str_sv_tmp);
138
140 str_sv.optimize(tb); // optimize the vector
141
142 str_sv.calc_stat(&st);
143 cout << "Used memory after remap and optimization: "
144 << st.memory_used
145 << std::endl;
146
147
148 // a slightly different way to do the reampped loading
149 // iterator in this case is used to collect character frequency
150 // tables and do remap on flush() (which is a bit faster)
151 //
152 // another option here is to turn vector into read-only mode
153 // which runs an embedded memory defragmentation algorithm
154 //
155 {
156 str_sv_type str_sv1;
158 str_sv1.get_back_inserter();
159 bi.set_remap(true);
160 for (auto str : str_vec)
161 bi = str;
162 bi.flush();
163
164
165 str_sv1.optimize(tb);
166
167 // freeze is used to turn the vector into read-only (immutable)
168 //
169 str_sv1.freeze();
170
171 assert(str_sv1.is_ro());
172 bool eq = str_sv1.equal(str_sv);
173 assert(eq);
174
175 str_sv1.calc_stat(&st);
176 cout << "Used memory after remap / optimization / freeze: "
177 << st.memory_used
178 << std::endl;
179
180 // construct a vector with remap table derived from another
181 // we can just add the same data into the new vector
182 // or part of the same data (projection) or reorderd data.
183 //
184 // the key thing is that we have to guarantee that the
185 // remapping tables will be the same
186 //
187 {
189 {
190 unsigned cnt = 0;
192 str_sv2.get_back_inserter();
193 for (auto str : str_vec)
194 {
195 bi = str;
196 if (++cnt >= 10)
197 break; // pick first 10
198 }
199 bi.flush();
200 }
201 str_sv2.optimize();
202 cout << "size2=" << str_sv2.size() << endl; // 10
203 }
204
205 }
206
207
208 }
209
210 // serialize and save
211 //
212 {
213 std::string fname = "test.sv";
215
217 bm::sparse_vector_serialize(str_sv, sv_lay, tb);
218
219 std::ofstream fout(fname.c_str(), std::ios::binary);
220 if (!fout.good())
221 {
222 return -1;
223 }
224 const char* buf = (char*)sv_lay.buf();
225 fout.write(buf, (unsigned)sv_lay.size());
226 if (!fout.good())
227 {
228 return -1;
229 }
230 fout.close();
231
232 cout << "Saved size: " << sv_lay.size() << endl;
233 }
234
235 }
236 catch(std::exception& ex)
237 {
238 std::cerr << ex.what() << std::endl;
239 return 1;
240 }
241
242
243 return 0;
244}
245
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Serialization for sparse_vector<>
string sparse vector based on bit-transposed matrix
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
Back insert iterator implements buffered insert, faster than generic access assignment.
void flush()
flush the accumulated buffer.
void set_remap(bool flag) BMNOEXCEPT
Method to configure back inserter to collect statistics on optimal character codes.
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
bool is_ro() const BMNOEXCEPT
Returns true if vector is read-only.
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
void calc_stat(struct str_sparse_vector< CharType, BV, STR_SIZE >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector Remapped vector likely saves memo...
size_type size() const
return size of the vector
void freeze()
Turn sparse vector into immutable mode Read-only (immutable) vector uses less memory and allows faste...
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
@ COPY_RTABLES
copy remap tables only (without data)
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
bm::bvector bvector_type
int main(void)
static void generate_string_set(vector< string > &str_vec)
size_t memory_used
memory usage for all blocks and service tables
Definition: bmfunc.h:62
layout class for serialization buffer structure
size_t size() const BMNOEXCEPT
return current serialized size
const unsigned char * buf() const BMNOEXCEPT
Return serialization buffer pointer.