BitMagic-C++
rscsample05.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2020 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example rscsample05.cpp
20 Example of how to use collaborative compression (XOR compression)
21 for a group of bit-transposed sparse vectors in a data-frame
22
23 \sa bm::sparse_vector
24 \sa bm::rsc_sparse_vector
25 \sa bm::sparse_vector_serializer
26 \sa bm::sparse_vector_deserializer
27
28 \sa rscsample02.cpp
29 \sa svsample02.cpp
30*/
31
32/*! \file rscsample05.cpp
33 \brief Example: collaborative compression (XOR compression)
34*/
35
36#include <iostream>
37#include <vector>
38#include <cassert>
39
40#include "bm.h"
41#include "bmsparsevec.h"
42#include "bmsparsevec_compr.h"
43#include "bmsparsevec_serial.h"
44
45using namespace std;
46
47// type definitions
48//
49
54
59
60/// sample data-frame structure with multiple bm vectors
61///
63{
67
68 sparse_vector_u16 sv0; // non-compressed sparse vector
69
71 : sv0(bm::use_null)
72 {}
73};
74
75/// Estimate raw data size
76///
77inline
79{
80 size_t sz = 0;
81 if (df.csv1.size())
82 sz += df.csv1.size() * sizeof(df.csv1.get(0));
83 if (df.csv2.size())
84 sz += df.csv2.size() * sizeof(df.csv2.get(0));
85 if (df.csv3.size())
86 sz += df.csv3.size() * sizeof(df.csv3.get(0));
87 if (df.sv0.size())
88 sz += df.sv0.size() * sizeof(df.sv0.get(0));
89 return sz;
90}
91
92/// generate some data just to illustrate the case
93///
94static
96{
97 for (unsigned i = 0; i < 65536; i+=7)
98 {
99 df.csv1.set(i, 4);
100 df.csv2.set(i, 8);
101 df.csv3.set(i, 17);
102 df.sv0.set(i, i % 256);
103 }
104
105 // rebuild Rank-Select index once data are loaded
106 df.csv1.sync();
107 df.csv2.sync();
108 df.csv3.sync();
109}
110
111/// paranoiya check to make sure data frame matches pre-generated values
112static
114{
115 for (unsigned i = 0; i < 65536; i+=7)
116 {
117 auto v1 = df.csv1.get(i);
118 assert(v1 == 4); (void)v1;
119 auto v2 = df.csv2.get(i);
120 assert(v2 == 8); (void)v2;
121 auto v3 = df.csv3.get(i);
122 assert(v3 == 17); (void)v3;
123
124 auto v0 = df.sv0.get(i);
125 assert(v0 == i % 256); (void)v0;
126 }
127}
128
/**
    Copy the serialized content of a layout object into a destination buffer.

    @param buf_ptr  destination; must have room for at least sv_lay.size() bytes
    @param sv_lay   object exposing size() (byte count) and buf() (source bytes)
    @return pointer just past the last byte written (buf_ptr + sv_lay.size())
    @internal
 */
template<typename SVLay>
unsigned char* copy_buffer(unsigned char* buf_ptr, const SVLay& sv_lay)
{
    auto s = sv_lay.size();
    // memcpy requires valid pointers even for a zero-length copy, and an
    // empty layout may have no buffer at all, so skip the call in that case
    if (s)
        ::memcpy(buf_ptr, sv_lay.buf(), s);
    return buf_ptr + s;
}
140
141/// serialize with disabled XOR compression
142///
143static
145 std::vector<unsigned char>& buf,
146 csv_serializer_type& csv_ser,
147 sv16_serializer_type& sv16_ser)
148{
149 csv_ser.set_xor_ref(false); // disable XOR compression
150
151 // buffers for serialization
154
155 // serialize all data-frame vectors in their natural order (1, 2, 3 ... N)
156 //
157 csv_ser.serialize(df.csv1, sv_lay1);
158 csv_ser.serialize(df.csv2, sv_lay2);
159 csv_ser.serialize(df.csv3, sv_lay3);
160 sv16_ser.serialize(df.sv0, sv_lay0);
161
162 size_t sz = (sizeof(size_t) * 3) +
163 sv_lay1.size() + sv_lay2.size() + sv_lay3.size() + sv_lay0.size();
164
165 buf.resize(sz);
166
167 unsigned char* buf_ptr = buf.data(); // low level access to vector memory
168
169 // serialize data-frame header with sizes of containers
170 {
171 auto s = sv_lay1.size();
172 ::memcpy(buf_ptr, &s, sizeof(s));
173 buf_ptr += sizeof(s);
174 }
175 {
176 auto s = sv_lay2.size();
177 ::memcpy(buf_ptr, &s, sizeof(s));
178 buf_ptr += sizeof(s);
179 }
180 {
181 auto s = sv_lay3.size();
182 ::memcpy(buf_ptr, &s, sizeof(s));
183 buf_ptr += sizeof(s);
184 }
185 {
186 auto s = sv_lay0.size();
187 ::memcpy(buf_ptr, &s, sizeof(s));
188 buf_ptr += sizeof(s);
189 }
190
191 // save all serialization buffers to one BLOB
192 buf_ptr = copy_buffer(buf_ptr, sv_lay1);
193 buf_ptr = copy_buffer(buf_ptr, sv_lay2);
194 buf_ptr = copy_buffer(buf_ptr, sv_lay3);
195 buf_ptr = copy_buffer(buf_ptr, sv_lay0);
196
197}
198
199/// Simple (individual) de-serialization of vectors in the data-frame
200///
201static
203 const std::vector<unsigned char>& buf,
204 csv_deserializer_type& csv_dser,
205 sv16_deserializer_type& sv16_dser)
206{
207 assert(buf.size() > sizeof(size_t)*3);
208
209 size_t sz1, sz2, sz3, sz0;
210 const unsigned char* data_ptr = buf.data();
211
212 // read the header to be able to calculate deserialization
213 // offsets within BLOB
214 //
215 ::memcpy(&sz1, data_ptr, sizeof(size_t));
216 data_ptr += sizeof(size_t);
217 ::memcpy(&sz2, data_ptr, sizeof(size_t));
218 data_ptr += sizeof(size_t);
219 ::memcpy(&sz3, data_ptr, sizeof(size_t));
220 data_ptr += sizeof(size_t);
221 ::memcpy(&sz0, data_ptr, sizeof(size_t));
222 data_ptr += sizeof(size_t);
223
224 csv_dser.deserialize(df.csv1, data_ptr);
225 data_ptr += sz1;
226
227 csv_dser.deserialize(df.csv2, data_ptr);
228 data_ptr += sz2;
229
230 csv_dser.deserialize(df.csv3, data_ptr);
231 data_ptr += sz3;
232
233 sv16_dser.deserialize(df.sv0, data_ptr);
234 data_ptr += sz0;
235
236}
237
238
239
240
241/// Data frame serialization using collaborative method (XOR compression)
242///
243static
245 std::vector<unsigned char>& buf,
246 csv_serializer_type& csv_ser,
247 sv16_serializer_type& sv16_ser)
248{
249 try
250 {
251 // reference vector(s) to keep all bit-planes for collaborative compresson
253
254 // Build the list of reference vectors participating in our data-frame
255 // Important: add references in reverse (sic!) order (N ... 3, 2, 1)
256 // Important: add all data frame vectors
257
258 bv_ref.add_sparse_vector(df.sv0); // (!) last vectors is added first
259 bv_ref.add_sparse_vector(df.csv3);
260 bv_ref.add_sparse_vector(df.csv2);
261 bv_ref.add_sparse_vector(df.csv1);
262
263 csv_ser.set_xor_ref(&bv_ref); // connect reference vector to serializer
264 sv16_ser.set_xor_ref(&bv_ref); // connect reference vector to sv16 serializer
265
266 // compute XOR similarity model - it is common for all serializers
267 // and must be added after set_xor_ref()
268 //
269 bm::xor_sim_params x_params;
271 csv_ser.compute_sim_model(sim_model, bv_ref, x_params);
272
273 // add similarity model to each serializer
274 //
275 csv_ser.set_sim_model(&sim_model);
276 sv16_ser.set_sim_model(&sim_model);
277
278 // buffers for serialization
281
282 // serialize all data-frame vectors in their natural order (1, 2, 3 ... N)
283 //
284 csv_ser.serialize(df.csv1, sv_lay1);
285 csv_ser.serialize(df.csv2, sv_lay2);
286 csv_ser.serialize(df.csv3, sv_lay3);
287 sv16_ser.serialize(df.sv0, sv_lay0);
288
289 size_t sz = (sizeof(size_t) * 4) +
290 sv_lay1.size() + sv_lay2.size() + sv_lay3.size() + sv_lay0.size();
291
292 buf.resize(sz);
293
294 unsigned char* buf_ptr = buf.data(); // low level access to vector memory
295
296 // serialize data-frame header with sizes of containers
297 {
298 auto s = sv_lay1.size();
299 ::memcpy(buf_ptr, &s, sizeof(s));
300 buf_ptr += sizeof(s);
301 }
302 {
303 auto s = sv_lay2.size();
304 ::memcpy(buf_ptr, &s, sizeof(s));
305 buf_ptr += sizeof(s);
306 }
307 {
308 auto s = sv_lay3.size();
309 ::memcpy(buf_ptr, &s, sizeof(s));
310 buf_ptr += sizeof(s);
311 }
312 {
313 auto s = sv_lay0.size();
314 ::memcpy(buf_ptr, &s, sizeof(s));
315 buf_ptr += sizeof(s);
316 }
317
318 // save all serialization buffers to one BLOB
319 //
320 buf_ptr = copy_buffer(buf_ptr, sv_lay1);
321 buf_ptr = copy_buffer(buf_ptr, sv_lay2);
322 buf_ptr = copy_buffer(buf_ptr, sv_lay3);
323 buf_ptr = copy_buffer(buf_ptr, sv_lay0);
324
325
326 // if serializer is re-used we need to disconnect it from the
327 // current frame reference vectors
328 csv_ser.set_xor_ref(nullptr);
329 sv16_ser.set_xor_ref(nullptr);
330 }
331 catch (...)
332 {
333 // catch block is used to guarantee that serialiers
334 // are not associated with a dead reference vector
335 //
336 csv_ser.set_xor_ref(nullptr);
337 sv16_ser.set_xor_ref(nullptr);
338 throw;
339 }
340}
341
342/// Collaborative de-serialization of vectors in the data-frame
343///
344static
346 const std::vector<unsigned char>& buf,
347 csv_deserializer_type& csv_dser,
348 sv16_deserializer_type& sv16_dser)
349{
350 assert(buf.size() > sizeof(size_t)*3);
351
352 try
353 {
354 size_t sz1, sz2, sz3, sz0;
355 const unsigned char* data_ptr = buf.data();
356
357 // read the header to be able to calculate deserialization
358 // offsets within BLOB
359 //
360 ::memcpy(&sz1, data_ptr, sizeof(size_t));
361 data_ptr += sizeof(size_t);
362 ::memcpy(&sz2, data_ptr, sizeof(size_t));
363 data_ptr += sizeof(size_t);
364 ::memcpy(&sz3, data_ptr, sizeof(size_t));
365 data_ptr += sizeof(size_t);
366 ::memcpy(&sz0, data_ptr, sizeof(size_t));
367 data_ptr += sizeof(size_t);
368
369 // reference vector for XOR deserialization
370 //
372
373 // ----------------------------------------------------------
374 // first pass: build reference vectors, pre-construct vectors
375 //
376 csv_dser.deserialize_structure(df.csv1, data_ptr);
377 data_ptr += sz1;
378 csv_dser.deserialize_structure(df.csv2, data_ptr);
379 data_ptr += sz2;
380 csv_dser.deserialize_structure(df.csv3, data_ptr);
381 data_ptr += sz3;
382 sv16_dser.deserialize_structure(df.sv0, data_ptr);
383 data_ptr += sz0;
384
385 // construct the reference vector
386 // Important: add references in reverse (sic!) order (N ... 3, 2, 1)
387 // Important: add all data frame vectors
388 //
389 bv_ref.add_vectors(df.sv0.get_bmatrix()); // (!) last comes first
390 bv_ref.add_vectors(df.csv3.get_bmatrix());
391 bv_ref.add_vectors(df.csv2.get_bmatrix());
392 bv_ref.add_vectors(df.csv1.get_bmatrix());
393
394 // all de-serializers in the data-frame connect to the same set of refs
395 csv_dser.set_xor_ref(&bv_ref);
396 sv16_dser.set_xor_ref(&bv_ref);
397
398
399 // -------------------------------------------------------------
400 // second pass: data deserialization
401 //
402
403 // get deserialization start-pointer again
404 data_ptr = buf.data() + (4 * sizeof(size_t));
405
406
407 // it is important that second pass uses 'false' as a third arument
408 // to keep pre-created vectors structure, which is otherwise destroyed
409 //
410
411 csv_dser.deserialize(df.csv1, data_ptr, false);
412 data_ptr += sz1;
413
414 csv_dser.deserialize(df.csv2, data_ptr, false);
415 data_ptr += sz2;
416
417 csv_dser.deserialize(df.csv3, data_ptr, false);
418 data_ptr += sz3;
419
420 sv16_dser.deserialize(df.sv0, data_ptr, false);
421 data_ptr += sz0;
422
423
424 // disconnect deserialized from the reference vector
425 // before leaving the scope
426 //
427 csv_dser.set_xor_ref(nullptr);
428 sv16_dser.set_xor_ref(nullptr);
429 }
430 catch (...)
431 {
432 // catch block is used to guarantee that de-serialiers
433 // are not associated with a dead reference vector
434 //
435 csv_dser.set_xor_ref(nullptr);
436 sv16_dser.set_xor_ref(nullptr);
437 throw;
438 }
439
440
441}
442
443
444int main(void)
445{
446 try
447 {
448 std::vector<unsigned char> buf0, buf2;
449
450 {
451 sample_data_frame df1; // sample data-frame
452 fill_test_data(df1); // add some test data
453 test_data(df1);
454
455 size_t raw_size = raw_data_size(df1);
456 cout << "raw size: " << raw_size << endl;
457
458 // declare serializers for sparse vectors of different types
459 csv_serializer_type csv_ser;
460 sv16_serializer_type sv16_ser;
461
462 serialize_df0(df1, buf0, csv_ser, sv16_ser);
463 cout << "Plain serializarion: " << buf0.size() << endl;
464
465 serialize_df2(df1, buf2, csv_ser, sv16_ser);
466 cout << "XOR data frame serialization: " << buf2.size() << endl;
467 }
468
469 // de-serialiers for sparse vectors of different types
470 // please note that de-serializers are reusable
471 //
472 csv_deserializer_type csv_dser;
473 sv16_deserializer_type sv16_dser;
474
475 // run simple deserialization here, test results
476 {
477 sample_data_frame df0; // empty data frame to read into
478
479 deserialize_df0(df0, buf0, csv_dser, sv16_dser);
480
481 test_data(df0); // check to make sure we are OK
482 }
483
484 // collaborative deserialization (XOR decode)
485 {
487
488 deserialize_df2(df2, buf2, csv_dser, sv16_dser);
489
490 test_data(df2);
491 }
492
493
494 }
495 catch(std::exception& ex)
496 {
497 std::cerr << ex.what() << std::endl;
498 return 1;
499 }
500
501
502
503 return 0;
504}
505
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Sparse container sparse_vector<> for integer types using bit-transposition transform.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
const bmatrix_type & get_bmatrix() const BMNOEXCEPT
Definition: bmbmatrix.h:532
List of reference bit-vectors with their true index associations.
Definition: bmxor.h:624
void add_vectors(const BMATR &bmatr)
Append basic bit-matrix to the list of reference vectors.
Definition: bmxor.h:726
void add_sparse_vector(const SV &sv)
Add bit-transposed sparse vector as a bit-matrix.
Definition: bmxor.h:739
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
const bmatrix_type & get_bmatrix() const BMNOEXCEPT
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
void sync(bool force)
Re-calculate rank-select index for faster access to vector.
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
size_type size() const BMNOEXCEPT
return size of the vector
sparse vector de-serializer
void deserialize(SV &sv, const unsigned char *buf, bool clear_sv=true)
void deserialize_structure(SV &sv, const unsigned char *buf)
void set_xor_ref(bv_ref_vector_type *bv_ref_ptr)
Set external XOR reference vectors (data frame reference vectors)
void set_xor_ref(bool is_enabled) BMNOEXCEPT
Turn ON and OFF XOR compression of sparse vectors Enables XOR reference compression for the sparse ve...
void compute_sim_model(xor_sim_model_type &sim_model, const bv_ref_vector_type &ref_vect, const bm::xor_sim_params &params)
Calculate XOR similarity model for ref_vector; the reference vector must be associated beforehand.
void set_sim_model(const xor_sim_model_type *sim_model) BMNOEXCEPT
Attach serializer to a pre-computed similarity model.
void serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout)
Serialize sparse vector into a memory buffer(s) structure.
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
Definition: bmsparsevec.h:1741
size_type size() const BMNOEXCEPT
return size of the vector
Definition: bmsparsevec.h:711
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1804
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:229
Definition: bm.h:78
bm::sparse_vector_serializer< sparse_vector_u16 > sv16_serializer_type
Definition: rscsample05.cpp:56
static void fill_test_data(sample_data_frame &df)
generate some data just to illustrate the case
Definition: rscsample05.cpp:95
bm::bvector bvector_type
Definition: rscsample05.cpp:50
bm::sparse_vector_serializer< rsc_sparse_vector_u32 > csv_serializer_type
Definition: rscsample05.cpp:55
bm::sparse_vector< unsigned, bvector_type > sparse_vector_u32
Definition: rscsample05.cpp:52
static void deserialize_df0(sample_data_frame &df, const std::vector< unsigned char > &buf, csv_deserializer_type &csv_dser, sv16_deserializer_type &sv16_dser)
Simple (individual) de-serialization of vectors in the data-frame.
bm::sparse_vector_deserializer< rsc_sparse_vector_u32 > csv_deserializer_type
Definition: rscsample05.cpp:57
int main(void)
bm::sparse_vector< unsigned short, bvector_type > sparse_vector_u16
Definition: rscsample05.cpp:51
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: rscsample05.cpp:53
static void test_data(sample_data_frame &df)
paranoiya check to make sure data frame matches pre-generated values
bm::sparse_vector_deserializer< sparse_vector_u16 > sv16_deserializer_type
Definition: rscsample05.cpp:58
unsigned char * copy_buffer(unsigned char *buf_ptr, const SVLay &sv_lay)
Copy buffer content into the buffer.
static void serialize_df0(const sample_data_frame &df, std::vector< unsigned char > &buf, csv_serializer_type &csv_ser, sv16_serializer_type &sv16_ser)
serialize with disabled XOR compression
static void deserialize_df2(sample_data_frame &df, const std::vector< unsigned char > &buf, csv_deserializer_type &csv_dser, sv16_deserializer_type &sv16_dser)
Collaborative de-serialization of vectors in the data-frame.
static void serialize_df2(const sample_data_frame &df, std::vector< unsigned char > &buf, csv_serializer_type &csv_ser, sv16_serializer_type &sv16_ser)
Data frame serialization using collaborative method (XOR compression)
size_t raw_data_size(const sample_data_frame &df)
Estimate raw data size.
Definition: rscsample05.cpp:78
layout class for serialization buffer structure
size_t size() const BMNOEXCEPT
return current serialized size
XOR similarity model.
Definition: bmxor.h:791
Parameters for XOR similarity search.
Definition: bmxor.h:59
sample data-frame structure with multiple bm vectors
Definition: rscsample05.cpp:63
rsc_sparse_vector_u32 csv3
Definition: rscsample05.cpp:66
rsc_sparse_vector_u32 csv2
Definition: rscsample05.cpp:65
sparse_vector_u16 sv0
Definition: rscsample05.cpp:68
rsc_sparse_vector_u32 csv1
Definition: rscsample05.cpp:64