BitMagic-C++
strsvsample02a.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2021 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample02a.cpp
20
21 Example shows how to use optimized bm::str_sparse_vector::compare() functions with std::sort
22 And then use bm::str_sparse_vector::remap() to reduce memory footprint.
23 Succinct methods are very responsive to sort order and other regularities in the data.
24
25 \sa bm::str_sparse_vector
26 \sa bm::str_sparse_vector::const_iterator
27 \sa bm::str_sparse_vector::back_insert_iterator
28 \sa bm::str_sparse_vector::compare
29 \sa bm::str_sparse_vector::remap
30 \sa bm::str_sparse_vector::optimize
31
32*/
33
34/*! \file strsvsample02a.cpp
35 \brief Example: str_sparse_vector<> sort example
36*/
37
38#include <iostream>
39#include <string>
40#include <vector>
41#include <random>
42#include <algorithm>
43
44#include "bm.h"
45#include "bmstrsparsevec.h"
46#include "bmsparsevec_algo.h"
47
48#include "bmtimer.h"
49#include "bmdbg.h"
50
51#include "bmundef.h" /* clear the pre-proc defines from BM */
52
53using namespace std;
54
57
58
59/// generate collection of strings from integers and shuffle it
60///
61static
62void generate_string_set(vector<string>& str_vec,
63 const unsigned max_coll = 250000)
64{
65 str_vec.resize(0);
66 string str;
67 for (unsigned i = 10; i < max_coll; i += unsigned(rand() % 3))
68 {
69 str = to_string(i);
70 str_vec.emplace_back(str);
71 } // for i
72
73 // shuffle the data set
74 //
75 std::random_device rd;
76 std::mt19937 g(rd());
77 std::shuffle(str_vec.begin(), str_vec.end(), g);
78}
79
80int main(void)
81{
82 try
83 {
84 str_sv_type str_sv;
85 std::vector<string> str_vec;
86
87 generate_string_set(str_vec);
88 {
89 auto bit = str_sv.get_back_inserter();
90 for (auto it = str_vec.begin(); it != str_vec.end(); ++it)
91 bit = *it;
92 bit.flush(); // important to avoid descructor exceptions
93 str_sv.optimize();
94 }
95 cout << endl;
96
97 // This approach to sort uses an index vector to perform the sort on
98 // plain uncompressed vector of indexes offers better sort performance
99 // because swap is faster.
100 //
101 // In this example we are swapping index elements, leaving the original
102 // vector in place with an option to copy it later
103 //
104
105 std::vector<uint32_t> index(str_sv.size());
106 std::generate(index.begin(), index.end(),
107 [n = 0] () mutable { return n++; });
108 {{
109 bm::chrono_taker tt(cout, "1.std::sort() of index: ", 0); // timing
110#if (1)
111 // fastest variant uses local cache to keep one of the comparison vars
112 // to reduce access to the succinct vector, right variable gets more hits
113 // due to specifics of sort implementation
114 // (improves comparison performance, alternative variant provided)
115 //
116
117 std::sort(index.begin(), index.end(),
118 [&str_sv](const uint32_t l, const uint32_t r)
119 {
120 static thread_local string last_right_str; // caching variable
121 static thread_local uint32_t last_right = uint32_t(-1);
122 if (last_right != r)
123 {
124 last_right = r;
125 str_sv.get(last_right, last_right_str);
126 }
127 return str_sv.compare(l, last_right_str.c_str()) < 0;
128 } // lambda
129 ); // sort
130#else
131
132 // possible alternative is to use sort with compare (l, r) it is
133 // measurably slower (due to hight latency more access to succint vector)
134 // (code is somewhat simpler)
135
136 std::sort(index.begin(), index.end(),
137 [&str_sv](const uint32_t l, const uint32_t r)
138 { return str_sv.compare(l, r) < 0; } // lambda
139 ); // sort
140#endif
141 }}
142
143 // copy the sorted vector
144 //
145 str_sv_type str_sv_sorted;
146 {
147 char buf[1024]; // value buffer sized for the max.possible len
148 auto bit = str_sv_sorted.get_back_inserter();
149 for (auto it = index.begin(); it != index.end(); ++it)
150 {
151 auto i = *it; // get index
152 str_sv.get(i, buf, sizeof(buf)); // read value by index
153 bit = (const char*)buf; // to back inserter
154 }
155 bit.flush();
156 }
157
158 assert(str_sv_sorted.size()==str_sv.size());
159
160
161 // validate the results to match STL sort
162 //
163 std::sort(str_vec.begin(), str_vec.end());
164 {
165 std::vector<string>::const_iterator sit = str_vec.begin();
166 auto it = str_sv_sorted.begin();
167 auto it_end = str_sv_sorted.end();
168 for (; it != it_end; ++it, ++sit)
169 {
170 string s = *it;
171 if (*sit != s)
172 {
173 cerr << "Mismatch at:" << s << "!=" << *sit << endl;
174 return 1;
175 }
176 } // for
177 }
178 cout << "Sort validation Ok." << endl << endl;
179 cout << "Memory footprint statistics:\n" << endl;
180 //
181 //
182 // compute succinct container stats before remapping
183 str_sv_type::statistics st_sorted;
184 str_sv_sorted.calc_stat(&st_sorted);
185 cout << "SV sorted(before remap) memory_used : "
186 << st_sorted.memory_used << endl;
187
188 // memory remapping turns vector into basically read-only
189 // (some careful modifications are allowed), but often saves memory
190 // due to frequency based succinct codes
191 //
192 str_sv_sorted.remap();
193 str_sv_sorted.optimize();
194
195
197 str_sv.calc_stat(&st);
198 str_sv_sorted.calc_stat(&st_sorted);
199
200 cout << "SV unsorted memory_used : "
201 << st.memory_used << endl;
202
203 cout << "SV sorted(after remap) memory_used : "
204 << st_sorted.memory_used << endl;
205
206 }
207 catch(std::exception& ex)
208 {
209 std::cerr << ex.what() << std::endl;
210 return 1;
211 }
212
213
214 return 0;
215}
216
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal)
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition: bm.h:115
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:41
void flush()
flush the accumulated buffer.
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
void calc_stat(struct str_sparse_vector< CharType, BV, STR_SIZE >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
size_type size() const
return size of the vector
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
int compare(size_type idx, const value_type *str) const BMNOEXCEPT
Compare vector element with argument lexicographically.
void remap()
Build remapping profile and re-load content to save memory.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
static void generate_string_set(vector< string > &str_vec, const unsigned max_coll=250000)
generate collection of strings from integers and shuffle it
bm::bvector bvector_type
int main(void)
bm::str_sparse_vector< char, bvector_type, 3 > str_sv_type
size_t memory_used
memory usage for all blocks and service tables
Definition: bmfunc.h:62