BitMagic-C++
inv_list.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2020 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example inv_list.cpp
20 Utility to compress test sets of inverted lists (Gov2 corpus)
21 and study compression characteristics and different comression levels
22*/
23
24/*! \file inv_list.cpp
25 \brief Utility to compress test sets of inverted lists
26*/
27
28#include <iostream>
29#include <chrono>
30#include <thread>
31#include <time.h>
32#include <stdio.h>
33#include <cstdlib>
34
35
36#ifdef _MSC_VER
37#pragma warning( push )
38#pragma warning( disable : 4996)
39#endif
40
41#include <vector>
42#include <chrono>
43#include <map>
44
45#include "bm.h"
46#include "bmalgo.h"
47#include "bmserial.h"
48#include "bmsparsevec.h"
49#include "bmsparsevec_compr.h"
50#include "bmsparsevec_algo.h"
51#include "bmsparsevec_serial.h"
52#include "bmalgo_similarity.h"
53
54
55#include "bmdbg.h"
56#include "bmtimer.h"
57#include "bmundef.h" /* clear the pre-proc defines from BM */
58
59using namespace std;
60
61static
63{
64 std::cout
65 << "BitMagic Inverted List Compression Test (c) 2019" << std::endl
66 << "-u32in u32-input-file -- raw 32-bit unsigned int file" << std::endl
67 << "-bvout bvect-output-file -- bit-vector compressed out file" << std::endl
68 << "-svout svect-output-file -- bit-transposed sparse vectors out file" << std::endl
69 << "-bvin bvect-input-file -- bit-vector compressed in file" << std::endl
70 << std::endl
71 << "-level N -- compression level to use (up to 5)" << std::endl
72 << "-silent (-s) -- no progress print or messages" << std::endl
73 << "-verify -- verify compressed version " << std::endl
74 << "-decode -- run decode test (in-memory)" << std::endl
75 << "-diag (-d) -- print statistics/diagnostics info" << std::endl
76 << "-timing (-t) -- evaluate timing/duration of operations" << std::endl
77 ;
78}
79
80
81
82
83// Arguments
84//
85std::string bv_in_file;
86std::string bv_out_file;
87
88std::string sv_in_file;
89std::string sv_out_file;
90std::string u32_in_file;
91std::string u32_out_file;
92
93bool is_diag = false;
94bool is_timing = false;
95bool is_verify = false;
96bool is_silent = false;
97bool is_decode = false;
98
100
101
102static
103int parse_args(int argc, char *argv[])
104{
105 for (int i = 1; i < argc; ++i)
106 {
107 std::string arg = argv[i];
108 if ((arg == "-h") || (arg == "--help"))
109 {
110 show_help();
111 return 0;
112 }
113
114 if (arg == "-v" || arg == "-verify")
115 {
116 if (i + 1 < argc)
117 {
118 is_verify = true;
119 }
120 continue;
121 }
122 if (arg == "-decode")
123 {
124 if (i + 1 < argc)
125 {
126 is_decode = true;
127 }
128 continue;
129 }
130 if (arg == "-l" || arg == "-level")
131 {
132 if (i + 1 < argc)
133 {
134 const char* lvl = argv[++i];
135 char *end;
136 c_level = (unsigned) std::strtoul(lvl, &end, 10);
137 if (errno == ERANGE)
138 {
139 std::cerr << "Error parsing -level: range error for "
140 << lvl<< std::endl;
141 return 1;
142 }
143 }
144 else
145 {
146 std::cerr << "Error: -level requires compression level number" << std::endl;
147 return 1;
148 }
149 continue;
150 }
151
152 if (arg == "-bvout" || arg == "--bvout")
153 {
154 if (i + 1 < argc)
155 {
156 bv_out_file = argv[++i];
157 }
158 else
159 {
160 std::cerr << "Error: -bvout requires file name" << std::endl;
161 return 1;
162 }
163 continue;
164 }
165 if (arg == "-bvin" || arg == "--bvin")
166 {
167 if (i + 1 < argc)
168 {
169 bv_in_file = argv[++i];
170 }
171 else
172 {
173 std::cerr << "Error: -bvin requires file name" << std::endl;
174 return 1;
175 }
176 continue;
177 }
178
179 if (arg == "-svin" || arg == "--svin")
180 {
181 if (i + 1 < argc)
182 {
183 sv_in_file = argv[++i];
184 }
185 else
186 {
187 std::cerr << "Error: -svin requires file name" << std::endl;
188 return 1;
189 }
190 continue;
191 }
192
193 if (arg == "-u32in" || arg == "--u32in")
194 {
195 if (i + 1 < argc)
196 {
197 u32_in_file = argv[++i];
198 }
199 else
200 {
201 std::cerr << "Error: -u32in requires file name" << std::endl;
202 return 1;
203 }
204 continue;
205 }
206
207 if (arg == "-svout" || arg == "--svout")
208 {
209 if (i + 1 < argc)
210 {
211 sv_out_file = argv[++i];
212 }
213 else
214 {
215 std::cerr << "Error: -svout requires file name" << std::endl;
216 return 1;
217 }
218 continue;
219 }
220
221
222 if (arg == "-silent" || arg == "--silent" || arg == "-s" || arg == "--s")
223 is_silent = true;
224 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
225 is_diag = true;
226 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
227 is_timing = true;
228
229
230 } // for i
231 return 0;
232}
233
234
235// Globals
236//
237
240
241
243
244
245/// Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
246///
247template<class VT>
248int io_read_u32_coll(std::ifstream& fin, VT& vec)
249{
250 typedef typename VT::value_type value_type;
251 vec.resize(0);
252 if (!fin.good())
253 return -1;
254 value_type len;
255 fin.read((char*) &len, std::streamsize(sizeof(len)));
256 if (!fin.good())
257 return -1;
258 if (!len)
259 return -2; // 0-len detected (broken file)
260
261 vec.resize(len);
262 fin.read((char*) &vec[0], std::streamsize(len*sizeof(value_type)));
263 if (!fin.good())
264 return -1;
265
266 return 0;
267}
268
269/// Check if input vector is monotonously sorted (true inverted list)
270/// along the way in computes a minimal delta between values
271///
272template<typename VT>
273int validate_inp_vec(const VT& vec,
274 typename VT::value_type& min_delta,
275 typename VT::value_type& min_delta_cnt
276 )
277{
278 typename VT::value_type md_cnt = min_delta_cnt = 0;
279 auto sz = vec.size();
280 if (!sz)
281 return -1;
282 auto i_prev = vec[0];
283 typename VT::value_type md = ~(typename VT::value_type(0)); // max possible uint
284 for (typename VT::size_type i = 1; i < sz; ++i)
285 {
286 auto val = vec[i];
287 if (val <= i_prev)
288 return -2;
289 typename VT::value_type d1 = val - i_prev;
290 if (d1 < md)
291 {
292 md_cnt = 0; md = d1;
293 }
294 md_cnt += (d1 == md);
295 i_prev = val;
296 }
297 min_delta = md;
298 min_delta_cnt = md_cnt;
299 return 0;
300}
301
302/// Verification check if integer vector is equivalent to a bit-vector
303///
304template<typename VT, typename BV>
305int compare_vect(const VT& vec, const BV& bv)
306{
307 if (vec.size() != bv.count())
308 {
309 return -1;
310 }
311 typename BV::enumerator en = bv.first();
312 typename VT::const_iterator it = vec.begin();
313 for (; en.valid(); ++en, ++it)
314 {
315 if (*en != *it)
316 return -1;
317 }
318 return 0;
319}
320
321/// Debug utility to detect super sparse bit-vectors
322/// which probably get bad compression rate
323///
324template<typename BV>
325bool is_super_sparse(const BV& bv)
326{
327 typename BV::statistics st;
328 bv.calc_stat(&st);
329 auto bc = bv.count();
330 auto blocks_count = st.gap_blocks + st.bit_blocks;
331 if (bc <= blocks_count)
332 return true;
333 auto bc_parity = blocks_count * 6;
334 return (bc <= bc_parity);
335}
336
337
338/// convert vector into bit-vector and append to the file
339///
340/// @return true if vector was detected as very low cardinality
341///
342template<typename VT>
343bool write_as_bvector(std::ofstream& bv_file,
344 const VT& vec,
346 bm::serializer<bm::bvector<> >::buffer& sbuf)
347{
349 bm::bvector<> bv;
350 bv.set(&vec[0], bm::bvector<>::size_type(vec.size()), bm::BM_SORTED);
351
352 bv.optimize(tb);
353 bvs.serialize(bv, sbuf);
354
355 unsigned bv_size = (unsigned)sbuf.size();
356 bv_file.write((char*)&bv_size, sizeof(bv_size));
357 bv_file.write((char*)sbuf.data(), (std::streamsize)sbuf.size());
358 if (!bv_file.good())
359 throw std::runtime_error("Error write to bvect out file");
360 return false;
361}
362
363/// convert vector into delta coded bit-transposed vector and append to the file
364///
365template<typename VT>
366void write_as_svector(std::ofstream& sv_file,
367 const VT& vec,
368 unsigned min_delta,
370{
371
373 bm::id_t prev = vec[0];
374
375 {
377 sv_bi.add(min_delta);
378 sv_bi.add(prev);
379
380 for (unsigned k = 1; k < vec.size(); ++k)
381 {
382 bm::id_t curr = vec[k];
383 bm::id_t delta = curr - prev;
384 if (delta < min_delta)
385 throw std::runtime_error("Input vector validation delta error");
386 delta -= min_delta;
387 sv_bi.add(delta);
388 prev = curr;
389 } // for i
390 sv_bi.flush();
391 }
392
394 sv.optimize(tb);
395 bm::sparse_vector_serialize(sv, sv_lay, tb);
396
397 unsigned sv_size = (unsigned)sv_lay.size();
398 sv_file.write((char*)&sv_size, sizeof(sv_size));
399 sv_file.write((char*)sv_lay.data(), (std::streamsize)sv_lay.size());
400 if (!sv_file.good())
401 throw std::runtime_error("Error write to bvect out file");
402
403}
404
405
406/// convert vector into delta coded bit-transposed vector and append to the file
407///
408template<typename VT>
409void write_as_rsc_svector(std::ofstream& sv_file,
410 const VT& vec,
411 unsigned min_delta,
413{
414
416 bm::id_t prev = vec[0];
417
418 {
420 sv_bi.add(min_delta);
421 sv_bi.add(prev);
422
423 for (unsigned k = 1; k < vec.size(); ++k)
424 {
425 bm::id_t curr = vec[k];
426 bm::id_t delta = curr - prev;
427 if (delta < min_delta)
428 throw std::runtime_error("Input vector validation delta error");
429 delta -= min_delta;
430 if (delta)
431 sv_bi.add(delta);
432 else
433 sv_bi.add_null();
434 prev = curr;
435 } // for i
436 sv_bi.flush();
437 }
438
440
441 rsc_sparse_vector_u32 csv; // compressed sparse vector
442 csv.load_from(sv); // load rank-select-compacted (rsc) sparse vector
443 csv.optimize(tb);
444
445 bm::sparse_vector_serialize(csv, sv_lay, tb);
446
447 unsigned sv_size = (unsigned)sv_lay.size();
448 sv_file.write((char*)&sv_size, sizeof(sv_size));
449 sv_file.write((char*)sv_lay.data(), (std::streamsize)sv_lay.size());
450 if (!sv_file.good())
451 throw std::runtime_error("Error write to bvect out file");
452
453}
454
455
456
457/// read the input collection sequence, write using various compression schemes
458///
459static
460void compress_inv_dump_file(const std::string& fname,
461 const std::string& bv_out_fname,
462 const std::string& sv_out_fname)
463{
464 bm::id64_t total_ints = 0;
465 bm::id64_t total_low_card = 0;
466 bm::id64_t total_low_card_size = 0;
467 bm::id64_t min_delta_ints = 0;
468 bm::id64_t sv_size = 0;
469 bm::id64_t rsc_diff_size = 0;
470 bm::id64_t sv_cnt = 0;
471
472 cout << "Reading input collection: " << fname << endl;
473 if (!bv_out_fname.empty())
474 cout << "Writing to BV collection: " << bv_out_fname << endl;
475 else
476 cout << "NO BV collection specified" << endl;
477 if (!sv_out_fname.empty())
478 cout << "Writing to SV collection: " << sv_out_fname << endl;
479 else
480 cout << "NO SV collection specified" << endl;
481
482
483 cout << "Compression level: " << c_level << endl;
484
485 bm::chrono_taker tt1(std::cout, "1. Convert collection", 1, &timing_map);
486
487 vector<unsigned> vec;
488 std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
489 if (!fin.good())
490 {
491 throw std::runtime_error("Cannot open input file");
492 }
493
494 std::ofstream bv_file;
495 if (!bv_out_fname.empty())
496 {
497 bv_file.open(bv_out_fname, std::ios::out | std::ios::binary);
498 if (!bv_file.good())
499 throw std::runtime_error("Cannot open bvect out file");
500 }
501 std::ofstream sv_file;
502 if (!sv_out_fname.empty())
503 {
504 sv_file.open(sv_out_fname, std::ios::out | std::ios::binary);
505 if (!sv_file.good())
506 throw std::runtime_error("Cannot open svect out file");
507 }
508
509
510 fin.seekg(0, std::ios::end);
511 std::streamsize fsize = fin.tellg();
512
513 fin.seekg(0, std::ios::beg);
514
515 // initialize serializer
516 //
518 bvs.byte_order_serialization(false);
519 bvs.gap_length_serialization(false);
521
522 // serialization target for sparse vector
525
526 bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
527
528 // main loop to read sample vectors
529 //
530 bm::id64_t i;
531 for (i = 0; true; ++i)
532 {
533 int ret = io_read_u32_coll(fin, vec);
534 if (ret != 0)
535 throw std::runtime_error("Error reading input file");
536 unsigned min_delta, min_delta_cnt;
537
538 {
539 ret = validate_inp_vec(vec, min_delta, min_delta_cnt); // check if we have sorted unique set
540 if (ret != 0)
541 throw std::runtime_error("Input vector validation failed");
542 if (!min_delta || !min_delta_cnt)
543 throw std::runtime_error("Input vector validation delta error");
544
545 min_delta_ints += min_delta_cnt;
546 }
547 total_ints += vec.size(); // remember the total size of the collection
548
549 // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
550 //
551 bool is_low_card = false;
552 if (!bv_out_fname.empty())
553 {
554 is_low_card = write_as_bvector(bv_file, vec, bvs, sbuf);
555 if (is_low_card)
556 {
557 total_low_card_size += sbuf.size();
558 ++total_low_card;
559 }
560 }
561
562 // commented out experimental (and very slow code) to evaluate rank-select compression
563 #if 0
564 if (!sv_out_fname.empty())
565 {
566 //write_as_svector(sv_file, vec, min_delta, sv_lay);
567 write_as_rsc_svector(sv_file, vec, min_delta, csv_lay);
568 sv_size += sv_lay.size();
569
570 /*
571 rsc_sparse_vector_u32 csv; // compressed sparse vector
572 csv.load_from(sv); // load rank-select-compacted (rsc) sparse vector
573
574 BM_DECLARE_TEMP_BLOCK(tb)
575 csv.optimize(tb);
576 bm::sparse_vector_serial_layout<rsc_sparse_vector_u32> sv_lay;
577 bm::sparse_vector_serialize(csv, sv_lay, tb);
578 if (sv_lay.size() < sbuf.size())
579 {
580 rsc_diff = sbuf.size() - sv_lay.size();
581 rsc_diff_size += rsc_diff;
582 sv_size += sv_lay.size();
583 sv_cnt++;
584 }
585 */
586 }
587 #endif
588
589
590 std::streamsize fpos_curr = fin.tellg();
591 if (fpos_curr == fsize)
592 break;
593
594 if (!is_silent)
595 {
596 cout << "\r" << i << " " << fpos_curr << " / " << fsize
597 << " ( size=" << vec.size() << " ) " << (is_low_card ? " * " : " ")
598 << " sv=" << sv_cnt << " rsc_diff=" << rsc_diff_size
599 << flush;
600 }
601
602 } // for i
603
604 // print statistics about test set
605
606 cout << endl;
607 cout << "Total vectors=" << i << endl;
608 cout << " lo-card=" << total_low_card << endl;
609 cout << " lo-card size = " << total_low_card_size << endl;
610 cout << " SV cnt = " << sv_cnt << endl;
611 cout << " SV size = " << sv_size << endl;
612 cout << " RSC diff = " << rsc_diff_size << endl;
613 cout << "Total ints=" << total_ints << endl;
614 cout << " min-deltas = " << min_delta_ints << endl;
615 {
616 double min_delta_ratio = double(min_delta_ints) / double(total_ints);
617 cout << " min delta ratio = " << std::setprecision(3) << min_delta_ratio << endl;
618 }
619 if (!bv_out_fname.empty())
620 {
621 bm::id64_t bv_size = (bm::id64_t)bv_file.tellp();
622 cout << "BV size = " << bv_size << endl;
623 // calculate bits per int compression ratio corrected not to account
624 // for size/length words prefixing the vectors
625 double bv_bits_per_int = double(bv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
626 cout << "BV Bits per/int = " << std::setprecision(3) << bv_bits_per_int << endl;
627
628 bv_file.close();
629 }
630
631 if (!sv_out_fname.empty())
632 {
633 sv_size = (bm::id64_t)sv_file.tellp();
634 cout << "SV size = " << sv_size << endl;
635 // calculate bits per int compression ratio corrected not to account
636 // for size/length words prefixing the vectors
637 double sv_bits_per_int = double(sv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
638 cout << "SV Bits per/int = " << std::setprecision(3) << sv_bits_per_int << endl;
639
640 sv_file.close();
641 }
642
643}
644
645// ------------------------------------------------------------------------
646
647/// read and desrialize bit-bector from the dump file
648///
649static
650int read_bvector(std::ifstream& bv_file,
651 bm::bvector<>& bv,
652 bm::serializer<bm::bvector<> >::buffer& sbuf)
653{
654 if (!bv_file.good())
655 return -1;
656 unsigned len;
657 bv_file.read((char*) &len, std::streamsize(sizeof(len)));
658 if (!bv_file.good())
659 return -1;
660 if (!len)
661 return -2; // 0-len detected (broken file)
662
663 sbuf.resize(len, false); // resize without content preservation
664 bv_file.read((char*) sbuf.data(), std::streamsize(len));
665 if (!bv_file.good())
666 return -1;
667
668 bm::deserialize(bv, sbuf.data());
669
670 return 0;
671}
672
673
674/// read the input collection sequence and dump file, verify correctness
675///
676static
677void verify_inv_dump_file(const std::string& fname,
678 const std::string& bv_in_fname)
679{
680 bm::id64_t total_ints = 0;
681
682
683 bm::chrono_taker tt1(std::cout, "2. Verify collection", 1, &timing_map);
684
685
686 cout << "Reading input collection: " << fname << endl;
687 if (!bv_in_fname.empty())
688 cout << "Reading BV collection: " << bv_in_fname << endl;
689 else
690 cout << "NO BV collection specified" << endl;
691
692
693 vector<unsigned> vec;
694 std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
695 if (!fin.good())
696 {
697 throw std::runtime_error("Cannot open input file");
698 }
699
700 std::ifstream bv_file;
701 std::streamsize fsize = 0;
702 if (!bv_in_fname.empty())
703 {
704 bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
705 if (!bv_file.good())
706 throw std::runtime_error("Cannot open bvect dump file");
707 fin.seekg(0, std::ios::end);
708 fsize = fin.tellg();
709 fin.seekg(0, std::ios::beg);
710 }
711
712
713 // initialize serializer
714 //
715
716 bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
717
718 // main loop to read sample vectors
719 //
720 bm::id64_t i;
721 for (i = 0; true; ++i)
722 {
723 int ret = io_read_u32_coll(fin, vec);
724 if (ret != 0)
725 throw std::runtime_error("Error reading input file");
726
727 total_ints += vec.size(); // remember the total size of the collection
728
729 // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
730 //
731 if (!bv_in_fname.empty())
732 {
733 bm::bvector<> bv;
734 read_bvector(bv_file, bv, sbuf);
735 int cmp = compare_vect(vec, bv);
736 if (cmp != 0)
737 {
738 throw std::runtime_error("Vector comparison failed");
739 }
740 }
741
742 std::streamsize fpos_curr = fin.tellg();
743 if (fpos_curr == fsize)
744 break;
745
746 if (!is_silent)
747 {
748 cout << "\r" << fpos_curr << "/" << fsize
749 << " ( size=" << vec.size() << " ) "
750 << flush;
751 }
752 } // for i
753
754 cout << endl;
755 cout << "Verification complete." << endl;
756 cout << "Total vectors=" << i << endl;
757 cout << "Total ints=" << total_ints << endl;
758}
759
760/// read and decode the compressed dump file
761///
762static
763void decode_test_dump_file(const std::string& bv_in_fname)
764{
765 bm::chrono_taker tt1(std::cout, "3. Decode collection", 1, &timing_map);
766
767 std::ifstream bv_file;
768 std::streamsize fsize;
769 if (!bv_in_fname.empty())
770 {
771 bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
772 if (!bv_file.good())
773 throw std::runtime_error("Cannot open bvect dump file");
774 bv_file.seekg(0, std::ios::end);
775 fsize = bv_file.tellg();
776 bv_file.seekg(0, std::ios::beg);
777 }
778 else
779 {
780 throw std::runtime_error("Cannot open bvect dump file");
781 }
782
783
784 bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
785
786 // main loop to read sample vectors
787 //
788 bm::id64_t i;
789 for (i = 0; true; ++i)
790 {
791 // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
792 //
793 if (!bv_in_fname.empty())
794 {
795 bm::bvector<> bv;
796 read_bvector(bv_file, bv, sbuf);
797 }
798
799 std::streamsize fpos_curr = bv_file.tellg();
800 if (fpos_curr == fsize)
801 break;
802
803 if (!is_silent)
804 {
805 cout << "\r" << fpos_curr << "/" << fsize
806 << flush;
807 }
808 } // for i
809
810 cout << endl;
811 cout << "Decode complete." << endl;
812 cout << "Total vectors=" << i << endl;
813}
814
815
816
817int main(int argc, char *argv[])
818{
819 if (argc < 3)
820 {
821 show_help();
822 return 1;
823 }
824
825 try
826 {
827 auto ret = parse_args(argc, argv);
828 if (ret != 0)
829 return ret;
830
831 if (!u32_in_file.empty())
832 {
833 if (!is_verify)
834 {
835 cout << "Compression" << endl;
837 }
838 else
839 {
840 if (is_verify)
841 {
842 cout << "Verification." << endl;
844 }
845 }
846 }
847 if (is_decode)
848 {
849 cout << "Decode test." << endl;
851 }
852
853
854 if (is_timing) // print all collected timings
855 {
856 std::cout << std::endl << "Timings (ms):" << std::endl;
858 }
859 }
860 catch (std::exception& ex)
861 {
862 std::cerr << "Error:" << ex.what() << std::endl;
863 return 1;
864 }
865
866 return 0;
867}
868
869
870#ifdef _MSC_VER
871#pragma warning( pop )
872#endif
873
874
875
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Algorithms for bvector<> (main include)
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:4153
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
Definition: bm.h:3600
bvector_size_type size_type
Definition: bm.h:121
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:150
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector slices
Bit-vector serialization class.
Definition: bmserial.h:76
void gap_length_serialization(bool value) BMNOEXCEPT
Set GAP length serialization (serializes GAP levels of the original vector)
Definition: bmserial.h:1275
void set_compression_level(unsigned clevel) BMNOEXCEPT
Set compression level.
Definition: bmserial.h:1254
void byte_order_serialization(bool value) BMNOEXCEPT
Set byte-order serialization (for cross platform compatibility)
Definition: bmserial.h:1281
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
Definition: bmsparsevec.h:572
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmsparsevec.h:2094
@ BM_SORTED
input set is sorted (ascending order)
Definition: bmconst.h:205
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:229
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
Definition: bmserial.h:3140
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
bool is_verify
Definition: inv_list.cpp:95
int main(int argc, char *argv[])
Definition: inv_list.cpp:817
int validate_inp_vec(const VT &vec, typename VT::value_type &min_delta, typename VT::value_type &min_delta_cnt)
Check if input vector is monotonously sorted (true inverted list) along the way in computes a minimal...
Definition: inv_list.cpp:273
unsigned c_level
Definition: inv_list.cpp:99
static int parse_args(int argc, char *argv[])
Definition: inv_list.cpp:103
std::string bv_in_file
Definition: inv_list.cpp:85
std::string u32_out_file
Definition: inv_list.cpp:91
std::string bv_out_file
Definition: inv_list.cpp:86
std::string sv_out_file
Definition: inv_list.cpp:89
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
Definition: inv_list.cpp:238
static int read_bvector(std::ifstream &bv_file, bm::bvector<> &bv, bm::serializer< bm::bvector<> >::buffer &sbuf)
read and desrialize bit-bector from the dump file
Definition: inv_list.cpp:650
static void verify_inv_dump_file(const std::string &fname, const std::string &bv_in_fname)
read the input collection sequence and dump file, verify correctness
Definition: inv_list.cpp:677
std::string u32_in_file
Definition: inv_list.cpp:90
static void show_help()
Definition: inv_list.cpp:62
bool is_super_sparse(const BV &bv)
Debug utility to detect super sparse bit-vectors which probably get bad compression rate.
Definition: inv_list.cpp:325
bm::chrono_taker ::duration_map_type timing_map
Definition: inv_list.cpp:242
static void compress_inv_dump_file(const std::string &fname, const std::string &bv_out_fname, const std::string &sv_out_fname)
read the input collection sequence, write using various compression schemes
Definition: inv_list.cpp:460
int compare_vect(const VT &vec, const BV &bv)
Verification check if integer vector is equivalent to a bit-vector.
Definition: inv_list.cpp:305
bool is_diag
Definition: inv_list.cpp:93
void write_as_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
Definition: inv_list.cpp:366
std::string sv_in_file
Definition: inv_list.cpp:88
void write_as_rsc_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< rsc_sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
Definition: inv_list.cpp:409
int io_read_u32_coll(std::ifstream &fin, VT &vec)
Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
Definition: inv_list.cpp:248
bool write_as_bvector(std::ofstream &bv_file, const VT &vec, bm::serializer< bm::bvector<> > &bvs, bm::serializer< bm::bvector<> >::buffer &sbuf)
convert vector into bit-vector and append to the file
Definition: inv_list.cpp:343
bool is_silent
Definition: inv_list.cpp:96
static void decode_test_dump_file(const std::string &bv_in_fname)
read and decode the compressed dump file
Definition: inv_list.cpp:763
bool is_timing
Definition: inv_list.cpp:94
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: inv_list.cpp:239
bool is_decode
Definition: inv_list.cpp:97
const unsigned set_compression_default
Default compression level.
Definition: bmserial.h:60
unsigned long long int id64_t
Definition: bmconst.h:35
unsigned int id_t
Definition: bmconst.h:38
layout class for serialization buffer structure
size_t size() const BMNOEXCEPT
return current serialized size
const unsigned char * data() const BMNOEXCEPT
Return serialization buffer pointer.