BitMagic-C++
inv_list.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2002-2020 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example inv_list.cpp
20  Utility to compress test sets of inverted lists (Gov2 corpus)
21  and study compression characteristics and different comression levels
22 */
23 
24 /*! \file inv_list.cpp
25  \brief Utility to compress test sets of inverted lists
26 */
27 
28 #include <iostream>
29 #include <chrono>
30 #include <thread>
31 #include <time.h>
32 #include <stdio.h>
33 #include <cstdlib>
34 
35 
36 #ifdef _MSC_VER
37 #pragma warning( push )
38 #pragma warning( disable : 4996)
39 #endif
40 
41 #include <vector>
42 #include <chrono>
43 #include <map>
44 
45 #include "bm.h"
46 #include "bmalgo.h"
47 #include "bmserial.h"
48 #include "bmsparsevec.h"
49 #include "bmsparsevec_compr.h"
50 #include "bmsparsevec_algo.h"
51 #include "bmsparsevec_serial.h"
52 #include "bmalgo_similarity.h"
53 
54 
55 #include "bmdbg.h"
56 #include "bmtimer.h"
57 #include "bmundef.h" /* clear the pre-proc defines from BM */
58 
59 using namespace std;
60 
61 static
62 void show_help()
63 {
64  std::cout
65  << "BitMagic Inverted List Compression Test (c) 2019" << std::endl
66  << "-u32in u32-input-file -- raw 32-bit unsigned int file" << std::endl
67  << "-bvout bvect-output-file -- bit-vector compressed out file" << std::endl
68  << "-svout svect-output-file -- bit-transposed sparse vectors out file" << std::endl
69  << "-bvin bvect-input-file -- bit-vector compressed in file" << std::endl
70  << std::endl
71  << "-level N -- compression level to use (up to 5)" << std::endl
72  << "-silent (-s) -- no progress print or messages" << std::endl
73  << "-verify -- verify compressed version " << std::endl
74  << "-decode -- run decode test (in-memory)" << std::endl
75  << "-diag (-d) -- print statistics/diagnostics info" << std::endl
76  << "-timing (-t) -- evaluate timing/duration of operations" << std::endl
77  ;
78 }
79 
80 
81 
82 
83 // Arguments
84 //
85 std::string bv_in_file;
86 std::string bv_out_file;
87 
88 std::string sv_in_file;
89 std::string sv_out_file;
90 std::string u32_in_file;
91 std::string u32_out_file;
92 
93 bool is_diag = false;
94 bool is_timing = false;
95 bool is_verify = false;
96 bool is_silent = false;
97 bool is_decode = false;
98 
100 
101 
102 static
103 int parse_args(int argc, char *argv[])
104 {
105  for (int i = 1; i < argc; ++i)
106  {
107  std::string arg = argv[i];
108  if ((arg == "-h") || (arg == "--help"))
109  {
110  show_help();
111  return 0;
112  }
113 
114  if (arg == "-v" || arg == "-verify")
115  {
116  if (i + 1 < argc)
117  {
118  is_verify = true;
119  }
120  continue;
121  }
122  if (arg == "-decode")
123  {
124  if (i + 1 < argc)
125  {
126  is_decode = true;
127  }
128  continue;
129  }
130  if (arg == "-l" || arg == "-level")
131  {
132  if (i + 1 < argc)
133  {
134  const char* lvl = argv[++i];
135  char *end;
136  c_level = (unsigned) std::strtoul(lvl, &end, 10);
137  if (errno == ERANGE)
138  {
139  std::cerr << "Error parsing -level: range error for "
140  << lvl<< std::endl;
141  return 1;
142  }
143  }
144  else
145  {
146  std::cerr << "Error: -level requires compression level number" << std::endl;
147  return 1;
148  }
149  continue;
150  }
151 
152  if (arg == "-bvout" || arg == "--bvout")
153  {
154  if (i + 1 < argc)
155  {
156  bv_out_file = argv[++i];
157  }
158  else
159  {
160  std::cerr << "Error: -bvout requires file name" << std::endl;
161  return 1;
162  }
163  continue;
164  }
165  if (arg == "-bvin" || arg == "--bvin")
166  {
167  if (i + 1 < argc)
168  {
169  bv_in_file = argv[++i];
170  }
171  else
172  {
173  std::cerr << "Error: -bvin requires file name" << std::endl;
174  return 1;
175  }
176  continue;
177  }
178 
179  if (arg == "-svin" || arg == "--svin")
180  {
181  if (i + 1 < argc)
182  {
183  sv_in_file = argv[++i];
184  }
185  else
186  {
187  std::cerr << "Error: -svin requires file name" << std::endl;
188  return 1;
189  }
190  continue;
191  }
192 
193  if (arg == "-u32in" || arg == "--u32in")
194  {
195  if (i + 1 < argc)
196  {
197  u32_in_file = argv[++i];
198  }
199  else
200  {
201  std::cerr << "Error: -u32in requires file name" << std::endl;
202  return 1;
203  }
204  continue;
205  }
206 
207  if (arg == "-svout" || arg == "--svout")
208  {
209  if (i + 1 < argc)
210  {
211  sv_out_file = argv[++i];
212  }
213  else
214  {
215  std::cerr << "Error: -svout requires file name" << std::endl;
216  return 1;
217  }
218  continue;
219  }
220 
221 
222  if (arg == "-silent" || arg == "--silent" || arg == "-s" || arg == "--s")
223  is_silent = true;
224  if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
225  is_diag = true;
226  if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
227  is_timing = true;
228 
229 
230  } // for i
231  return 0;
232 }
233 
234 
235 // Globals
236 //
237 
240 
241 
243 
244 
245 /// Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
246 ///
247 template<class VT>
248 int io_read_u32_coll(std::ifstream& fin, VT& vec)
249 {
250  typedef typename VT::value_type value_type;
251  vec.resize(0);
252  if (!fin.good())
253  return -1;
254  value_type len;
255  fin.read((char*) &len, std::streamsize(sizeof(len)));
256  if (!fin.good())
257  return -1;
258  if (!len)
259  return -2; // 0-len detected (broken file)
260 
261  vec.resize(len);
262  fin.read((char*) &vec[0], std::streamsize(len*sizeof(value_type)));
263  if (!fin.good())
264  return -1;
265 
266  return 0;
267 }
268 
269 /// Check if input vector is monotonously sorted (true inverted list)
270 /// along the way in computes a minimal delta between values
271 ///
272 template<typename VT>
273 int validate_inp_vec(const VT& vec,
274  typename VT::value_type& min_delta,
275  typename VT::value_type& min_delta_cnt
276  )
277 {
278  typename VT::value_type md_cnt = min_delta_cnt = 0;
279  auto sz = vec.size();
280  if (!sz)
281  return -1;
282  auto i_prev = vec[0];
283  typename VT::value_type md = ~(typename VT::value_type(0)); // max possible uint
284  for (typename VT::size_type i = 1; i < sz; ++i)
285  {
286  auto val = vec[i];
287  if (val <= i_prev)
288  return -2;
289  typename VT::value_type d1 = val - i_prev;
290  if (d1 < md)
291  {
292  md_cnt = 0; md = d1;
293  }
294  md_cnt += (d1 == md);
295  i_prev = val;
296  }
297  min_delta = md;
298  min_delta_cnt = md_cnt;
299  return 0;
300 }
301 
302 /// Verification check if integer vector is equivalent to a bit-vector
303 ///
304 template<typename VT, typename BV>
305 int compare_vect(const VT& vec, const BV& bv)
306 {
307  if (vec.size() != bv.count())
308  {
309  return -1;
310  }
311  typename BV::enumerator en = bv.first();
312  typename VT::const_iterator it = vec.begin();
313  for (; en.valid(); ++en, ++it)
314  {
315  if (*en != *it)
316  return -1;
317  }
318  return 0;
319 }
320 
321 /// Debug utility to detect super sparse bit-vectors
322 /// which probably get bad compression rate
323 ///
324 template<typename BV>
325 bool is_super_sparse(const BV& bv)
326 {
327  typename BV::statistics st;
328  bv.calc_stat(&st);
329  auto bc = bv.count();
330  auto blocks_count = st.gap_blocks + st.bit_blocks;
331  if (bc <= blocks_count)
332  return true;
333  auto bc_parity = blocks_count * 6;
334  return (bc <= bc_parity);
335 }
336 
337 
338 /// convert vector into bit-vector and append to the file
339 ///
340 /// @return true if vector was detected as very low cardinality
341 ///
342 template<typename VT>
343 bool write_as_bvector(std::ofstream& bv_file,
344  const VT& vec,
346  bm::serializer<bm::bvector<> >::buffer& sbuf)
347 {
349  bm::bvector<> bv;
350  bv.set(&vec[0], bm::bvector<>::size_type(vec.size()), bm::BM_SORTED);
351 
352  bv.optimize(tb);
353  bvs.serialize(bv, sbuf);
354 
355  unsigned bv_size = (unsigned)sbuf.size();
356  bv_file.write((char*)&bv_size, sizeof(bv_size));
357  bv_file.write((char*)sbuf.data(), (std::streamsize)sbuf.size());
358  if (!bv_file.good())
359  throw std::runtime_error("Error write to bvect out file");
360  return false;
361 }
362 
363 /// convert vector into delta coded bit-transposed vector and append to the file
364 ///
365 template<typename VT>
366 void write_as_svector(std::ofstream& sv_file,
367  const VT& vec,
368  unsigned min_delta,
370 {
371 
373  bm::id_t prev = vec[0];
374 
375  {
377  sv_bi.add(min_delta);
378  sv_bi.add(prev);
379 
380  for (unsigned k = 1; k < vec.size(); ++k)
381  {
382  bm::id_t curr = vec[k];
383  bm::id_t delta = curr - prev;
384  if (delta < min_delta)
385  throw std::runtime_error("Input vector validation delta error");
386  delta -= min_delta;
387  sv_bi.add(delta);
388  prev = curr;
389  } // for i
390  sv_bi.flush();
391  }
392 
394  sv.optimize(tb);
395  bm::sparse_vector_serialize(sv, sv_lay, tb);
396 
397  unsigned sv_size = (unsigned)sv_lay.size();
398  sv_file.write((char*)&sv_size, sizeof(sv_size));
399  sv_file.write((char*)sv_lay.data(), (std::streamsize)sv_lay.size());
400  if (!sv_file.good())
401  throw std::runtime_error("Error write to bvect out file");
402 
403 }
404 
405 
406 /// convert vector into delta coded bit-transposed vector and append to the file
407 ///
408 template<typename VT>
409 void write_as_rsc_svector(std::ofstream& sv_file,
410  const VT& vec,
411  unsigned min_delta,
413 {
414 
416  bm::id_t prev = vec[0];
417 
418  {
420  sv_bi.add(min_delta);
421  sv_bi.add(prev);
422 
423  for (unsigned k = 1; k < vec.size(); ++k)
424  {
425  bm::id_t curr = vec[k];
426  bm::id_t delta = curr - prev;
427  if (delta < min_delta)
428  throw std::runtime_error("Input vector validation delta error");
429  delta -= min_delta;
430  if (delta)
431  sv_bi.add(delta);
432  else
433  sv_bi.add_null();
434  prev = curr;
435  } // for i
436  sv_bi.flush();
437  }
438 
440 
441  rsc_sparse_vector_u32 csv; // compressed sparse vector
442  csv.load_from(sv); // load rank-select-compacted (rsc) sparse vector
443  csv.optimize(tb);
444 
445  bm::sparse_vector_serialize(csv, sv_lay, tb);
446 
447  unsigned sv_size = (unsigned)sv_lay.size();
448  sv_file.write((char*)&sv_size, sizeof(sv_size));
449  sv_file.write((char*)sv_lay.data(), (std::streamsize)sv_lay.size());
450  if (!sv_file.good())
451  throw std::runtime_error("Error write to bvect out file");
452 
453 }
454 
455 
456 
457 /// read the input collection sequence, write using various compression schemes
458 ///
459 static
460 void compress_inv_dump_file(const std::string& fname,
461  const std::string& bv_out_fname,
462  const std::string& sv_out_fname)
463 {
464  bm::id64_t total_ints = 0;
465  bm::id64_t total_low_card = 0;
466  bm::id64_t total_low_card_size = 0;
467  bm::id64_t min_delta_ints = 0;
468  bm::id64_t sv_size = 0;
469  bm::id64_t rsc_diff_size = 0;
470  bm::id64_t sv_cnt = 0;
471 
472  cout << "Reading input collection: " << fname << endl;
473  if (!bv_out_fname.empty())
474  cout << "Writing to BV collection: " << bv_out_fname << endl;
475  else
476  cout << "NO BV collection specified" << endl;
477  if (!sv_out_fname.empty())
478  cout << "Writing to SV collection: " << sv_out_fname << endl;
479  else
480  cout << "NO SV collection specified" << endl;
481 
482 
483  cout << "Compression level: " << c_level << endl;
484 
485  bm::chrono_taker tt1("1. Convert collection", 1, &timing_map);
486 
487  vector<unsigned> vec;
488  std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
489  if (!fin.good())
490  {
491  throw std::runtime_error("Cannot open input file");
492  }
493 
494  std::ofstream bv_file;
495  if (!bv_out_fname.empty())
496  {
497  bv_file.open(bv_out_fname, std::ios::out | std::ios::binary);
498  if (!bv_file.good())
499  throw std::runtime_error("Cannot open bvect out file");
500  }
501  std::ofstream sv_file;
502  if (!sv_out_fname.empty())
503  {
504  sv_file.open(sv_out_fname, std::ios::out | std::ios::binary);
505  if (!sv_file.good())
506  throw std::runtime_error("Cannot open svect out file");
507  }
508 
509 
510  fin.seekg(0, std::ios::end);
511  std::streamsize fsize = fin.tellg();
512 
513  fin.seekg(0, std::ios::beg);
514 
515  // initialize serializer
516  //
518  bvs.byte_order_serialization(false);
519  bvs.gap_length_serialization(false);
521 
522  // serialization target for sparse vector
525 
526  bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
527 
528  // main loop to read sample vectors
529  //
530  bm::id64_t i;
531  for (i = 0; true; ++i)
532  {
533  int ret = io_read_u32_coll(fin, vec);
534  if (ret != 0)
535  throw std::runtime_error("Error reading input file");
536  unsigned min_delta, min_delta_cnt;
537 
538  {
539  ret = validate_inp_vec(vec, min_delta, min_delta_cnt); // check if we have sorted unique set
540  if (ret != 0)
541  throw std::runtime_error("Input vector validation failed");
542  if (!min_delta || !min_delta_cnt)
543  throw std::runtime_error("Input vector validation delta error");
544 
545  min_delta_ints += min_delta_cnt;
546  }
547  total_ints += vec.size(); // remember the total size of the collection
548 
549  // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
550  //
551  bool is_low_card = false;
552  if (!bv_out_fname.empty())
553  {
554  is_low_card = write_as_bvector(bv_file, vec, bvs, sbuf);
555  if (is_low_card)
556  {
557  total_low_card_size += sbuf.size();
558  ++total_low_card;
559  }
560  }
561 
562  // commented out experimental (and very slow code) to evaluate rank-select compression
563  #if 0
564  if (!sv_out_fname.empty())
565  {
566  //write_as_svector(sv_file, vec, min_delta, sv_lay);
567  write_as_rsc_svector(sv_file, vec, min_delta, csv_lay);
568  sv_size += sv_lay.size();
569 
570  /*
571  rsc_sparse_vector_u32 csv; // compressed sparse vector
572  csv.load_from(sv); // load rank-select-compacted (rsc) sparse vector
573 
574  BM_DECLARE_TEMP_BLOCK(tb)
575  csv.optimize(tb);
576  bm::sparse_vector_serial_layout<rsc_sparse_vector_u32> sv_lay;
577  bm::sparse_vector_serialize(csv, sv_lay, tb);
578  if (sv_lay.size() < sbuf.size())
579  {
580  rsc_diff = sbuf.size() - sv_lay.size();
581  rsc_diff_size += rsc_diff;
582  sv_size += sv_lay.size();
583  sv_cnt++;
584  }
585  */
586  }
587  #endif
588 
589 
590  std::streamsize fpos_curr = fin.tellg();
591  if (fpos_curr == fsize)
592  break;
593 
594  if (!is_silent)
595  {
596  cout << "\r" << i << " " << fpos_curr << " / " << fsize
597  << " ( size=" << vec.size() << " ) " << (is_low_card ? " * " : " ")
598  << " sv=" << sv_cnt << " rsc_diff=" << rsc_diff_size
599  << flush;
600  }
601 
602  } // for i
603 
604  // print statistics about test set
605 
606  cout << endl;
607  cout << "Total vectors=" << i << endl;
608  cout << " lo-card=" << total_low_card << endl;
609  cout << " lo-card size = " << total_low_card_size << endl;
610  cout << " SV cnt = " << sv_cnt << endl;
611  cout << " SV size = " << sv_size << endl;
612  cout << " RSC diff = " << rsc_diff_size << endl;
613  cout << "Total ints=" << total_ints << endl;
614  cout << " min-deltas = " << min_delta_ints << endl;
615  {
616  double min_delta_ratio = double(min_delta_ints) / double(total_ints);
617  cout << " min delta ratio = " << std::setprecision(3) << min_delta_ratio << endl;
618  }
619  if (!bv_out_fname.empty())
620  {
621  bm::id64_t bv_size = (bm::id64_t)bv_file.tellp();
622  cout << "BV size = " << bv_size << endl;
623  // calculate bits per int compression ratio corrected not to account
624  // for size/length words prefixing the vectors
625  double bv_bits_per_int = double(bv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
626  cout << "BV Bits per/int = " << std::setprecision(3) << bv_bits_per_int << endl;
627 
628  bv_file.close();
629  }
630 
631  if (!sv_out_fname.empty())
632  {
633  sv_size = (bm::id64_t)sv_file.tellp();
634  cout << "SV size = " << sv_size << endl;
635  // calculate bits per int compression ratio corrected not to account
636  // for size/length words prefixing the vectors
637  double sv_bits_per_int = double(sv_size * 8ull - (i*sizeof(unsigned))) / double(total_ints);
638  cout << "SV Bits per/int = " << std::setprecision(3) << sv_bits_per_int << endl;
639 
640  sv_file.close();
641  }
642 
643 }
644 
645 // ------------------------------------------------------------------------
646 
647 /// read and desrialize bit-bector from the dump file
648 ///
649 static
650 int read_bvector(std::ifstream& bv_file,
651  bm::bvector<>& bv,
652  bm::serializer<bm::bvector<> >::buffer& sbuf)
653 {
654  if (!bv_file.good())
655  return -1;
656  unsigned len;
657  bv_file.read((char*) &len, std::streamsize(sizeof(len)));
658  if (!bv_file.good())
659  return -1;
660  if (!len)
661  return -2; // 0-len detected (broken file)
662 
663  sbuf.resize(len, false); // resize without content preservation
664  bv_file.read((char*) sbuf.data(), std::streamsize(len));
665  if (!bv_file.good())
666  return -1;
667 
668  bm::deserialize(bv, sbuf.data());
669 
670  return 0;
671 }
672 
673 
674 /// read the input collection sequence and dump file, verify correctness
675 ///
676 static
677 void verify_inv_dump_file(const std::string& fname,
678  const std::string& bv_in_fname)
679 {
680  bm::id64_t total_ints = 0;
681 
682 
683  bm::chrono_taker tt1("2. Verify collection", 1, &timing_map);
684 
685 
686  cout << "Reading input collection: " << fname << endl;
687  if (!bv_in_fname.empty())
688  cout << "Reading BV collection: " << bv_in_fname << endl;
689  else
690  cout << "NO BV collection specified" << endl;
691 
692 
693  vector<unsigned> vec;
694  std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
695  if (!fin.good())
696  {
697  throw std::runtime_error("Cannot open input file");
698  }
699 
700  std::ifstream bv_file;
701  std::streamsize fsize;
702  if (!bv_in_fname.empty())
703  {
704  bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
705  if (!bv_file.good())
706  throw std::runtime_error("Cannot open bvect dump file");
707  fin.seekg(0, std::ios::end);
708  fsize = fin.tellg();
709  fin.seekg(0, std::ios::beg);
710  }
711 
712 
713  // initialize serializer
714  //
715 
716  bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
717 
718  // main loop to read sample vectors
719  //
720  bm::id64_t i;
721  for (i = 0; true; ++i)
722  {
723  int ret = io_read_u32_coll(fin, vec);
724  if (ret != 0)
725  throw std::runtime_error("Error reading input file");
726 
727  total_ints += vec.size(); // remember the total size of the collection
728 
729  // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
730  //
731  if (!bv_in_fname.empty())
732  {
733  bm::bvector<> bv;
734  read_bvector(bv_file, bv, sbuf);
735  int cmp = compare_vect(vec, bv);
736  if (cmp != 0)
737  {
738  throw std::runtime_error("Vector comparison failed");
739  }
740  }
741 
742  std::streamsize fpos_curr = fin.tellg();
743  if (fpos_curr == fsize)
744  break;
745 
746  if (!is_silent)
747  {
748  cout << "\r" << fpos_curr << "/" << fsize
749  << " ( size=" << vec.size() << " ) "
750  << flush;
751  }
752  } // for i
753 
754  cout << endl;
755  cout << "Verification complete." << endl;
756  cout << "Total vectors=" << i << endl;
757  cout << "Total ints=" << total_ints << endl;
758 }
759 
760 /// read and decode the compressed dump file
761 ///
762 static
763 void decode_test_dump_file(const std::string& bv_in_fname)
764 {
765  bm::chrono_taker tt1("3. Decode collection", 1, &timing_map);
766 
767  std::ifstream bv_file;
768  std::streamsize fsize;
769  if (!bv_in_fname.empty())
770  {
771  bv_file.open(bv_in_fname, std::ios::in | std::ios::binary);
772  if (!bv_file.good())
773  throw std::runtime_error("Cannot open bvect dump file");
774  bv_file.seekg(0, std::ios::end);
775  fsize = bv_file.tellg();
776  bv_file.seekg(0, std::ios::beg);
777  }
778  else
779  {
780  throw std::runtime_error("Cannot open bvect dump file");
781  }
782 
783 
784  bm::serializer<bm::bvector<> >::buffer sbuf; // resizable memory buffer
785 
786  // main loop to read sample vectors
787  //
788  bm::id64_t i;
789  for (i = 0; true; ++i)
790  {
791  // serialize and save as a bit-vector size0:<BLOB0>, size1:<BLOB1>...N
792  //
793  if (!bv_in_fname.empty())
794  {
795  bm::bvector<> bv;
796  read_bvector(bv_file, bv, sbuf);
797  }
798 
799  std::streamsize fpos_curr = bv_file.tellg();
800  if (fpos_curr == fsize)
801  break;
802 
803  if (!is_silent)
804  {
805  cout << "\r" << fpos_curr << "/" << fsize
806  << flush;
807  }
808  } // for i
809 
810  cout << endl;
811  cout << "Decode complete." << endl;
812  cout << "Total vectors=" << i << endl;
813 }
814 
815 
816 
817 int main(int argc, char *argv[])
818 {
819  if (argc < 3)
820  {
821  show_help();
822  return 1;
823  }
824 
825  try
826  {
827  auto ret = parse_args(argc, argv);
828  if (ret != 0)
829  return ret;
830 
831  if (!u32_in_file.empty())
832  {
833  if (!is_verify)
834  {
835  cout << "Compression" << endl;
837  }
838  else
839  {
840  if (is_verify)
841  {
842  cout << "Verification." << endl;
844  }
845  }
846  }
847  if (is_decode)
848  {
849  cout << "Decode test." << endl;
851  }
852 
853 
854  if (is_timing) // print all collected timings
855  {
856  std::cout << std::endl << "Timings (ms):" << std::endl;
858  }
859  }
860  catch (std::exception& ex)
861  {
862  std::cerr << "Error:" << ex.what() << std::endl;
863  return 1;
864  }
865 
866  return 0;
867 }
868 
869 
870 #ifdef _MSC_VER
871 #pragma warning( pop )
872 #endif
873 
874 
875 
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
static void show_help()
Definition: inv_list.cpp:62
size_t size() const
return current serialized size
void gap_length_serialization(bool value) BMNOEXCEPT
Set GAP length serialization (serializes GAP levels of the original vector)
Definition: bmserial.h:1213
static void verify_inv_dump_file(const std::string &fname, const std::string &bv_in_fname)
read the input collection sequence and dump file, verify correctness
Definition: inv_list.cpp:677
static int read_bvector(std::ifstream &bv_file, bm::bvector<> &bv, bm::serializer< bm::bvector<> >::buffer &sbuf)
read and desrialize bit-bector from the dump file
Definition: inv_list.cpp:650
bm::chrono_taker::duration_map_type timing_map
Definition: inv_list.cpp:242
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector plains
Definition: bmsparsevec.h:1778
unsigned long long int id64_t
Definition: bmconst.h:34
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector plains
bm::id_t size_type
Definition: bm.h:117
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector&#39;s memory allocation.
Definition: bm.h:3095
Timing utilities for benchmarking (internal)
bool is_silent
Definition: inv_list.cpp:96
int compare_vect(const VT &vec, const BV &bv)
Verification check if integer vector is equivalent to a bit-vector.
Definition: inv_list.cpp:305
bool write_as_bvector(std::ofstream &bv_file, const VT &vec, bm::serializer< bm::bvector<> > &bvs, bm::serializer< bm::bvector<> >::buffer &sbuf)
convert vector into bit-vector and append to the file
Definition: inv_list.cpp:343
pre-processor un-defines to avoid global space pollution (internal)
Bit-vector serialization class.
Definition: bmserial.h:75
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
bool is_verify
Definition: inv_list.cpp:95
std::string u32_out_file
Definition: inv_list.cpp:91
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Algorithms for bvector<> (main include)
sparse vector with runtime compression using bit transposition method
Definition: bmsparsevec.h:81
bool is_super_sparse(const BV &bv)
Debug utility to detect super sparse bit-vectors which probably get bad compression rate...
Definition: inv_list.cpp:325
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
Definition: inv_list.cpp:238
input set is sorted (ascending order)
Definition: bmconst.h:191
void set_compression_level(unsigned clevel) BMNOEXCEPT
Set compression level.
Definition: bmserial.h:1192
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
Definition: bm.h:3605
bool is_decode
Definition: inv_list.cpp:97
bool is_diag
Definition: inv_list.cpp:93
int validate_inp_vec(const VT &vec, typename VT::value_type &min_delta, typename VT::value_type &min_delta_cnt)
Check if input vector is monotonously sorted (true inverted list) along the way in computes a minimal...
Definition: inv_list.cpp:273
static void decode_test_dump_file(const std::string &bv_in_fname)
read and decode the compressed dump file
Definition: inv_list.cpp:763
const unsigned char * data() const
Return serialization buffer pointer.
support "non-assigned" or "NULL" logic
Definition: bmconst.h:215
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
std::string u32_in_file
Definition: inv_list.cpp:90
void byte_order_serialization(bool value) BMNOEXCEPT
Set byte-order serialization (for cross platform compatibility)
Definition: bmserial.h:1219
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:146
const unsigned set_compression_default
Default compression level.
Definition: bmserial.h:60
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
Definition: inv_list.cpp:239
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster...
Definition: bmsparsevec.h:495
void write_as_rsc_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< rsc_sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
Definition: inv_list.cpp:409
Serialization for sparse_vector<>
static int parse_args(int argc, char *argv[])
Definition: inv_list.cpp:103
bool is_timing
Definition: inv_list.cpp:94
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
std::string bv_out_file
Definition: inv_list.cpp:86
void write_as_svector(std::ofstream &sv_file, const VT &vec, unsigned min_delta, bm::sparse_vector_serial_layout< sparse_vector_u32 > &sv_lay)
convert vector into delta coded bit-transposed vector and append to the file
Definition: inv_list.cpp:366
std::string sv_out_file
Definition: inv_list.cpp:89
std::string bv_in_file
Definition: inv_list.cpp:85
int main(int argc, char *argv[])
Definition: inv_list.cpp:817
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs...
layout class for serialization buffer structure
unsigned c_level
Definition: inv_list.cpp:99
unsigned int id_t
Definition: bmconst.h:37
int io_read_u32_coll(std::ifstream &fin, VT &vec)
Read 32-bit vector size-prefix format (length:0, 1, 2, 3, ....)
Definition: inv_list.cpp:248
static void compress_inv_dump_file(const std::string &fname, const std::string &bv_out_fname, const std::string &sv_out_fname)
read the input collection sequence, write using various compression schemes
Definition: inv_list.cpp:460
Compressed sparse container rsc_sparse_vector<> for integer types.
std::string sv_in_file
Definition: inv_list.cpp:88
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
Definition: bmserial.h:2926
Algorithms for bm::sparse_vector.