BitMagic-C++
xsample02.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example xsample02.cpp
20  Counting sort using bit-vector and sparse vector to build histogram of unsigned ints.
21  Benchmark compares different histogram buiding techniques using BitMagic and std::sort()
22 
23  Histogram construction, based on integer events is a common problem,
24  this demo studies different approaches, potential for parallelization and other
25  aspects.
26 */
27 
28 /*! \file xsample02.cpp
29  \brief Example: sparse_vector<> used for counting sort / historgam construction
30 */
31 
32 
33 #include <iostream>
34 #include <memory>
35 #include <map>
36 #include <vector>
37 #include <chrono>
38 #include <algorithm>
39 #include <random>
40 #include <stdexcept>
41 
42 #include <future>
43 #include <thread>
44 
45 #include "bm.h"
46 #include "bmtimer.h"
47 #include "bmsparsevec.h"
48 
49 
50 // ----------------------------------------------------
51 // Global parameters and types
52 // ----------------------------------------------------
53 
54 const unsigned value_max = 1250000; // range of variants of events [0..max]
55 const unsigned test_size = 250000000; // number of events (ints) to generate
56 
57 // -------------------------------------------
58 // Random generator
59 // -------------------------------------------
60 
61 std::random_device rand_dev;
62 std::mt19937 gen(rand_dev());
63 std::uniform_int_distribution<> rand_dis(1, value_max); // generate uniform numebrs for [1, vector_max]
64 
65 
67 typedef std::map<unsigned, unsigned> map_u32;
68 
69 
70 // timing storage for benchmarking
72 
73 
74 // -------------------------------------------
75 // Counting sort / histogram construction (std::map)
76 // -------------------------------------------
77 
78 static
79 void sort_map(map_u32& hmap, const std::vector<unsigned>& vin)
80 {
81  for (auto v : vin)
82  {
83  hmap[v]++;
84  }
85 }
86 
87 
88 // -------------------------------------------
89 // Counting sort / histogram construction (naive)
90 // -------------------------------------------
91 
92 // This sorting method uses sparse_vector<> as a storage but implements increment
93 // as an get-inc-put operations (decoding-encoding every value in the sum)
94 //
95 static
96 void counting_sort_naive(sparse_vector_u32& sv_out, const std::vector<unsigned>& vin)
97 {
98  for (auto v : vin)
99  {
100  auto count = sv_out.get(v);
101  sv_out.set(v, count + 1);
102  }
103 }
104 
105 // -------------------------------------------
106 // Counting sort / histogram construction
107 // -------------------------------------------
108 
109 // This sorting method uses sparse_vector<> as a storage but implements increment
110 // but increment was implemented as a bm::sparse_vector::inc() method
111 // which in turn is based on uses bvector<>::inc()
112 //
113 // This approach is faster than decoding-encoding used in naive counting sort
114 //
115 static
116 void counting_sort(sparse_vector_u32& sv_out, const std::vector<unsigned>& vin)
117 {
118  for(auto v : vin)
119  sv_out.inc(v);
120 }
121 
122 // --------------------------------------------------
123 // Counting sort / histogram construction (parallel)
124 // --------------------------------------------------
125 
126 // parallel subproblem for all even numbers: (v & 1) == 0
127 inline
128 unsigned counting_sort_subbatch(sparse_vector_u32* sv_out, const std::vector<unsigned>* vin)
129 {
130  for (size_t i = 0; i < vin->size(); i++)
131  {
132  auto v = (*vin)[i];
133  if ((v & 1) == 0)
134  sv_out->inc(v);
135  }
136  return 0;
137 }
138 
139 // Parallel histogram construction uses a very simple divide and conquer technique
140 // splitting by even/odd numbers, uses std::async() for parallelization
141 //
142 // (should be possible to do a lot better than that)
143 //
144 static
145 void counting_sort_parallel(sparse_vector_u32& sv_out, const std::vector<unsigned>& vin)
146 {
148  // process evens in parallel
149  std::future<unsigned> f1 = std::async(std::launch::async, counting_sort_subbatch, &sv_out2, &vin);
150 
151  // process all odd elements
152  for (size_t i = 0; i < vin.size(); i++)
153  {
154  auto v = vin[i];
155  if (v & 1)
156  sv_out.inc(v);
157  }
158  f1.wait();
159 
160  // merge effectively performs logical OR on all plains, only it
161  // borrows memory blocks from the argument vector, so it gets changed
162  // (which is ok here, since sv_out2 is a temporary)
163  //
164  sv_out.merge(sv_out2);
165 }
166 
167 // Test utility. It also illustrates histogram access method
168 //
169 static
171 {
174 
175  for (; en.valid(); ++en)
176  {
177  unsigned v = *en;
178  unsigned cnt = sv.get(v);
179  for (unsigned j = 0; j < cnt; ++j)
180  {
181  std::cout << v << ", ";
182  } // for
183  } // for en
184  std::cout << std::endl;
185 }
186 
187 // Test utility for std::map
188 //
189 static
190 void print_sorted(const map_u32& hmap)
191 {
192  map_u32::const_iterator it = hmap.begin();
193  map_u32::const_iterator it_end = hmap.end();
194 
195  for (; it != it_end; ++it)
196  {
197  unsigned v = it->first;
198  unsigned cnt = it->second;
199  for (unsigned j = 0; j < cnt; ++j)
200  {
201  std::cout << v << ", ";
202  } // for
203  } // for en
204  std::cout << std::endl;
205 }
206 
207 
208 // build histogram using sorted vector
209 //
210 static
211 void build_histogram(sparse_vector_u32& sv_out, const std::vector<unsigned>& vin)
212 {
213  if (vin.empty())
214  return;
215  unsigned start = vin[0];
216  unsigned count = 0; // histogram counter
217  for (auto v : vin)
218  {
219  if (v == start)
220  {
221  ++count;
222  }
223  else
224  {
225  sv_out.set(start, count);
226  start = v; count = 1;
227  }
228  }
229  if (count)
230  {
231  sv_out.set(start, count);
232  }
233 }
234 
235 
236 
237 
238 int main(void)
239 {
240  try
241  {
242  // try simple input vector as a model
243  //
244  {
245  std::vector<unsigned> v {10, 1, 5, 4, 8, 8, 8} ;
246  sparse_vector_u32 r_sv(bm::use_null); // result vector
247 
248  counting_sort(r_sv, v);
249 
250  print_sorted(r_sv); // 1, 4, 5, 8, 8, 8, 10,
251 
253  counting_sort_parallel(p_sv, v);
254  print_sorted(r_sv);
255 
256  map_u32 h_map;
257  sort_map(h_map, v);
258  print_sorted(h_map);
259 
260 
261  std::sort(v.begin(), v.end());
262  sparse_vector_u32 h_sv(bm::use_null); // histogram vector
263  build_histogram(h_sv, v);
264  if (!r_sv.equal(h_sv))
265  {
266  std::cerr << "Error: Histogram comparison failed!" << std::endl;
267  print_sorted(h_sv);
268  return 1;
269  }
270 
271  }
272 
273  // run benchmarks
274  //
275  std::vector<unsigned> v;
276 
277  // generate vector of random numbers
278  for (unsigned i = 0; i < test_size; ++i)
279  {
280  v.push_back(unsigned(rand_dis(gen)));
281  }
282  std::cout << "test vector generation ok" << std::endl;
283 
288  map_u32 h_map;
289 
290  {
291  bm::chrono_taker tt1("1. counting sort ", 1, &timing_map);
292  counting_sort(r_sv, v);
293  }
294 
295  {
296  bm::chrono_taker tt1("3. counting sort (naive) ", 1, &timing_map);
297  counting_sort_naive(n_sv, v);
298  }
299 
300  {
301  bm::chrono_taker tt1("4. counting sort (parallel) ", 1, &timing_map);
302  counting_sort_parallel(p_sv, v);
303  }
304 
305  {
306  bm::chrono_taker tt1("5. counting sort (map) ", 1, &timing_map);
307  sort_map(h_map, v);
308  }
309 
310  {
311  bm::chrono_taker tt1("2. std::sort() + histogram", 1, &timing_map);
312  std::sort(v.begin(), v.end());
313  build_histogram(h_sv, v);
314  }
315 
316 
317  // quality assurance checks
318  //
319  if (!r_sv.equal(h_sv) || !n_sv.equal(h_sv))
320  {
321  std::cerr << "Error: Histogram comparison failed!" << std::endl;
322  return 1;
323  }
324  if (!r_sv.equal(p_sv))
325  {
326  std::cerr << "Error: Histogram comparison failed for parallel sort!" << std::endl;
327  return 1;
328  }
329 
330  // compute memory consumption of sparse vector
331  {
332  std::cout << std::endl;
333 
335  sparse_vector_u32::statistics st;
337 
338  std::cout << "Sparse vector memory usage:" << st.memory_used / (1024*1024)<< "MB" << std::endl;
339  std::cout << "vector<unsigned> usage:" << v.size() * sizeof(v[0]) / (1024 * 1024) << "MB" << std::endl << std::endl;
340  }
341 
342 
344 
345  }
346  catch(std::exception& ex)
347  {
348  std::cerr << ex.what() << std::endl;
349  return 1;
350  }
351 
352  return 0;
353 }
354 
main
int main(void)
Definition: xsample02.cpp:238
bm::sparse_vector::optimize
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector plains
Definition: bmsparsevec.h:1750
print_sorted
static void print_sorted(const sparse_vector_u32 &sv)
Definition: xsample02.cpp:170
bm::sparse_vector
sparse vector with runtime compression using bit transposition method
Definition: bmsparsevec.h:81
bm::chrono_taker
Utility class to collect performance measurements and statistics.
Definition: bmtimer.h:39
timing_map
bm::chrono_taker::duration_map_type timing_map
Definition: xsample02.cpp:71
BM_DECLARE_TEMP_BLOCK
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
value_max
const unsigned value_max
Definition: xsample02.cpp:54
bm::bvector<>::enumerator
friend class enumerator
Definition: bm.h:823
bmsparsevec.h
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
build_histogram
static void build_histogram(sparse_vector_u32 &sv_out, const std::vector< unsigned > &vin)
Definition: xsample02.cpp:211
sparse_vector_u32
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
Definition: xsample02.cpp:66
bm::sparse_vector::equal
bool equal(const sparse_vector< Val, BV > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
Definition: bmsparsevec.h:1912
counting_sort
static void counting_sort(sparse_vector_u32 &sv_out, const std::vector< unsigned > &vin)
Definition: xsample02.cpp:116
rand_dev
std::random_device rand_dev
Definition: xsample02.cpp:61
sort_map
static void sort_map(map_u32 &hmap, const std::vector< unsigned > &vin)
Definition: xsample02.cpp:79
counting_sort_parallel
static void counting_sort_parallel(sparse_vector_u32 &sv_out, const std::vector< unsigned > &vin)
Definition: xsample02.cpp:145
bm::bvector<>
bm::use_null
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:215
rand_dis
std::uniform_int_distribution rand_dis(1, value_max)
test_size
const unsigned test_size
Definition: xsample02.cpp:55
bm::bvector<>::opt_compress
@ opt_compress
compress blocks when possible (GAP/prefix sum)
Definition: bm.h:134
counting_sort_naive
static void counting_sort_naive(sparse_vector_u32 &sv_out, const std::vector< unsigned > &vin)
Definition: xsample02.cpp:96
bmtimer.h
Timing utilities for benchmarking (internal)
bm::chrono_taker::ct_ops_per_sec
@ ct_ops_per_sec
Definition: bmtimer.h:59
bm::sparse_vector::get
value_type get(size_type idx) const BMNOEXCEPT
get specified element without bounds checking
Definition: bmsparsevec.h:1451
bm::chrono_taker::duration_map_type
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition: bmtimer.h:65
bm::sparse_vector::merge
sparse_vector< Val, BV > & merge(sparse_vector< Val, BV > &sv)
merge with another sparse vector using OR operation Merge is different from join(),...
Definition: bmsparsevec.h:1825
bm::chrono_taker::print_duration_map
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition: bmtimer.h:127
gen
std::mt19937 gen(rand_dev())
bm::bvector::first
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition: bm.h:1796
counting_sort_subbatch
unsigned counting_sort_subbatch(sparse_vector_u32 *sv_out, const std::vector< unsigned > *vin)
Definition: xsample02.cpp:128
bm.h
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
map_u32
std::map< unsigned, unsigned > map_u32
Definition: xsample02.cpp:67
bm::sparse_vector::set
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
Definition: bmsparsevec.h:1475
bm::sparse_vector::inc
void inc(size_type idx)
increment specified element by one
Definition: bmsparsevec.h:1667
bm::base_sparse_vector::get_null_bvector
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values or NULL (if not constructed that way)
Definition: bmbmatrix.h:329