BitMagic-C++
bmrandom.h
Go to the documentation of this file.
1 #ifndef BMRANDOM__H__INCLUDED__
2 #define BMRANDOM__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmrandom.h
22  \brief Generation of random subset
23 */
24 
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30 
31 
32 #include "bmfunc.h"
33 #include "bmdef.h"
34 
35 #include <stdlib.h>
36 #include <random>
37 #include <algorithm>
38 
39 namespace bm
40 {
41 
42 
43 /*!
44  Class implements algorithm for random subset generation.
45 
46  Implemented method tries to be fair, but doesn't guarantee
47  true randomeness or absense of bias.
48 
49  Performace note:
50  Class holds temporary buffers and variables, so it is recommended to
51  re-use instances over multiple calls.
52 
53  \ingroup setalgo
54 */
55 template<class BV>
57 {
58 public:
59  typedef BV bvector_type;
60  typedef typename BV::size_type size_type;
61 public:
62  random_subset();
64 
65  /// Get random subset of input vector
66  ///
67  /// @param bv_out - destination vector
68  /// @param bv_in - input vector
69  /// @param sample_count - number of bits to pick
70  ///
71  void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72 
73 
74 private:
75  typedef
76  typename BV::blocks_manager_type blocks_manager_type;
77 
78 private:
79  /// simple picking algorithm for small number of items
80  /// in [first,last] range
81  ///
82  void simple_pick(BV& bv_out,
83  const BV& bv_in,
84  size_type sample_count,
85  size_type first,
86  size_type last);
87 
88  void get_subset(BV& bv_out,
89  const BV& bv_in,
90  size_type in_count,
91  size_type sample_count);
92 
93  void get_block_subset(bm::word_t* blk_out,
94  const bm::word_t* blk_src,
95  unsigned count);
96  static
97  unsigned process_word(bm::word_t* blk_out,
98  const bm::word_t* blk_src,
99  unsigned nword,
100  unsigned take_count) BMNOEXCEPT;
101 
102  static
103  void get_random_array(bm::word_t* blk_out,
105  unsigned bit_list_size,
106  unsigned count);
107  static
108  unsigned compute_take_count(unsigned bc,
109  size_type in_count, size_type sample_count) BMNOEXCEPT;
110 
111 
112 private:
114  random_subset& operator=(const random_subset&);
115 private:
116  typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117  bvector_type bv_nb_; ///< blocks vector
118  bm::gap_word_t bit_list_[bm::gap_max_bits];
119  bm::word_t* sub_block_;
120 };
121 
122 
123 ///////////////////////////////////////////////////////////////////
124 
125 
126 
127 template<class BV>
129 {
130  sub_block_ = new bm::word_t[bm::set_block_size];
131 }
132 
133 template<class BV>
135 {
136  delete [] sub_block_;
137 }
138 
139 template<class BV>
140 void random_subset<BV>::sample(BV& bv_out,
141  const BV& bv_in,
142  size_type sample_count)
143 {
144  if (sample_count == 0)
145  {
146  bv_out.clear(true);
147  return;
148  }
149 
150  rsi_.init();
151  bv_in.build_rs_index(&rsi_, &bv_nb_);
152 
153  size_type in_count = rsi_.count();
154  if (in_count <= sample_count)
155  {
156  bv_out = bv_in;
157  return;
158  }
159 
160  float pick_ratio = float(sample_count) / float(in_count);
161  if (pick_ratio < 0.054f)
162  {
163  size_type first, last;
164  bool b = bv_in.find_range(first, last);
165  if (!b)
166  return;
167 
168  simple_pick(bv_out, bv_in, sample_count, first, last);
169  return;
170  }
171 
172  if (sample_count > in_count/2)
173  {
174  // build the complement vector and subtract it
175  BV tmp_bv;
176  size_type delta_count = in_count - sample_count;
177 
178  get_subset(tmp_bv, bv_in, in_count, delta_count);
179  bv_out = bv_in;
180  bv_out -= tmp_bv;
181  return;
182  }
183  get_subset(bv_out, bv_in, in_count, sample_count);
184 }
185 
186 template<class BV>
187 void random_subset<BV>::simple_pick(BV& bv_out,
188  const BV& bv_in,
189  size_type sample_count,
190  size_type first,
191  size_type last)
192 {
193  bv_out.clear(true);
194 
195  std::random_device rd;
196  #ifdef BM64ADDR
197  std::mt19937_64 mt_rand(rd());
198  #else
199  std::mt19937 mt_rand(rd());
200  #endif
201  std::uniform_int_distribution<size_type> dist(first, last);
202 
203  while (sample_count)
204  {
205  size_type fidx;
206  size_type ridx = dist(mt_rand); // generate random position
207 
208  BM_ASSERT(ridx >= first && ridx <= last);
209 
210  bool b = bv_in.find(ridx, fidx); // find next valid bit after random
211  BM_ASSERT(b);
212  if (b)
213  {
214  // set true if was false
215  bool is_set = bv_out.set_bit_conditional(fidx, true, false);
216  sample_count -= is_set;
217  while (!is_set) // find next valid (and not set) bit
218  {
219  ++fidx;
220  // searching always left to right may create a bias...
221  b = bv_in.find(fidx, fidx);
222  if (!b)
223  break;
224  if (fidx > last)
225  break;
226  is_set = bv_out.set_bit_conditional(fidx, true, false);
227  sample_count -= is_set;
228  } // while
229  }
230  } // while
231 }
232 
233 
234 template<class BV>
235 void random_subset<BV>::get_subset(BV& bv_out,
236  const BV& bv_in,
237  size_type in_count,
238  size_type sample_count)
239 {
240  bv_out.clear(true);
241  bv_out.resize(bv_in.size());
242 
243  const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
244  blocks_manager_type& bman_out = bv_out.get_blocks_manager();
245 
246  bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
247 
248  size_type first_nb, last_nb;
249  bool b = bv_nb_.find_range(first_nb, last_nb);
250  BM_ASSERT(b);
251  if (!b)
252  return;
253 
254  std::random_device rd;
255  #ifdef BM64ADDR
256  std::mt19937_64 mt_rand(rd());
257  #else
258  std::mt19937 mt_rand(rd());
259  #endif
260  std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
261 
262  size_type curr_sample_count = sample_count;
263  for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
264  {
265  // pick block at random
266  //
267  size_type nb;
268  size_type ridx = dist_nb(mt_rand); // generate random block idx
269  BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
270 
271  b = bv_nb_.find(ridx, nb); // find next valid nb
272  if (!b)
273  {
274  b = bv_nb_.find(first_nb, nb);
275  if (!b)
276  {
277  b = bv_nb_.find(first_nb, nb);
278  BM_ASSERT(!bv_nb_.any());
279  BM_ASSERT(b);
280  return; // cannot find block
281  }
282  }
283  bv_nb_.clear_bit_no_check(nb); // remove from blocks list
284 
285  // calculate proportinal sample count
286  //
287  unsigned bc = rsi_.count(nb);
288  BM_ASSERT(bc && (bc <= 65536));
289  take_count = compute_take_count(bc, in_count, sample_count);
290  if (take_count > curr_sample_count)
291  take_count = unsigned(curr_sample_count);
292  BM_ASSERT(take_count);
293  if (!take_count)
294  continue;
295  {
296  unsigned i0, j0;
297  bm::get_block_coord(nb, i0, j0);
298  const bm::word_t* blk_src = bman_in.get_block(i0, j0);
299  BM_ASSERT(blk_src);
300 
301  // allocate target block
302  bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
303  BM_ASSERT(!blk_out);
304  if (blk_out)
305  {
306  blk_out = bman_out.deoptimize_block(nb);
307  }
308  else
309  {
310  blk_out = bman_out.get_allocator().alloc_bit_block();
311  bman_out.set_block(nb, blk_out);
312  }
313  if (take_count == bc) // whole block take (strange)
314  {
315  // copy the whole src block
316  if (BM_IS_GAP(blk_src))
317  bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
318  else
319  bm::bit_block_copy(blk_out, blk_src);
320  continue;
321  }
322  bm::bit_block_set(blk_out, 0);
323  if (bc < 4096) // use array shuffle
324  {
325  unsigned arr_len;
326  // convert source block to bit-block
327  if (BM_IS_GAP(blk_src))
328  {
329  arr_len = bm::gap_convert_to_arr(bit_list_,
330  BMGAP_PTR(blk_src),
332  }
333  else // bit-block
334  {
335  arr_len = bm::bit_block_convert_to_arr(bit_list_, blk_src, 0);
336  }
337  BM_ASSERT(arr_len);
338  get_random_array(blk_out, bit_list_, arr_len, take_count);
339  }
340  else // dense block
341  {
342  // convert source block to bit-block
343  if (BM_IS_GAP(blk_src))
344  {
345  bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
346  blk_src = tmp_block;
347  }
348  // pick random bits source block to target
349  get_block_subset(blk_out, blk_src, take_count);
350  }
351  }
352  } // for
353 }
354 
355 template<class BV>
356 unsigned random_subset<BV>::compute_take_count(
357  unsigned bc,
358  size_type in_count,
359  size_type sample_count) BMNOEXCEPT
360 {
361  float block_percent = float(bc) / float(in_count);
362  float bits_to_take = float(sample_count) * block_percent;
363  bits_to_take += 0.99f;
364  unsigned to_take = unsigned(bits_to_take);
365  if (to_take > bc)
366  to_take = bc;
367  return to_take;
368 }
369 
370 
371 template<class BV>
372 void random_subset<BV>::get_block_subset(bm::word_t* blk_out,
373  const bm::word_t* blk_src,
374  unsigned take_count)
375 {
376  for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
377  {
378  // pick random scan start and scan direction
379  //
380  unsigned i = unsigned(rand()) % bm::set_block_size;
381  unsigned new_count;
382 
383  for (; i < bm::set_block_size && take_count; ++i)
384  {
385  if (blk_src[i] && (blk_out[i] != blk_src[i]))
386  {
387  new_count = process_word(blk_out, blk_src, i, take_count);
388  take_count -= new_count;
389  }
390  } // for i
391 
392  } // for
393  // if masked scan did not produce enough results..
394  //
395  if (take_count)
396  {
397  // Find all vacant bits: do logical (src SUB out)
398  for (unsigned i = 0; i < bm::set_block_size; ++i)
399  {
400  sub_block_[i] = blk_src[i] & ~blk_out[i];
401  }
402  // now transform vacant bits to array, then pick random elements
403  //
404  unsigned arr_len =
405  bm::bit_block_convert_to_arr(bit_list_, sub_block_, 0);
406  // bm::gap_max_bits,
407  // bm::gap_max_bits,
408  // 0);
409  BM_ASSERT(arr_len);
410  get_random_array(blk_out, bit_list_, arr_len, take_count);
411  }
412 }
413 
414 template<class BV>
415 unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
416  const bm::word_t* blk_src,
417  unsigned nword,
418  unsigned take_count) BMNOEXCEPT
419 {
420  unsigned new_bits, mask;
421  do
422  {
423  mask = unsigned(rand());
424  mask ^= mask << 16;
425  } while (mask == 0);
426 
427  std::random_device rd;
428  #ifdef BM64ADDR
429  std::mt19937_64 mt_rand(rd());
430  #else
431  std::mt19937 mt_rand(rd());
432  #endif
433  unsigned src_rand = blk_src[nword] & mask;
434  new_bits = src_rand & ~blk_out[nword];
435  if (new_bits)
436  {
437  unsigned new_count = bm::word_bitcount(new_bits);
438 
439  // check if we accidentally picked more bits than needed
440  if (new_count > take_count)
441  {
442  BM_ASSERT(take_count);
443 
444  unsigned char blist[64];
445  unsigned arr_size = bm::bitscan_popcnt(new_bits, blist);
446  BM_ASSERT(arr_size == new_count);
447  std::shuffle(blist, blist + arr_size, mt_rand);
448  unsigned value = 0;
449  for (unsigned j = 0; j < take_count; ++j)
450  {
451  value |= (1u << blist[j]);
452  }
453  new_bits = value;
454  new_count = take_count;
455 
456  BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
457  BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
458  }
459 
460  blk_out[nword] |= new_bits;
461  return new_count;
462  }
463  return 0;
464 }
465 
466 
467 template<class BV>
468 void random_subset<BV>::get_random_array(bm::word_t* blk_out,
470  unsigned bit_list_size,
471  unsigned count)
472 {
473  std::random_device rd;
474  #ifdef BM64ADDR
475  std::mt19937_64 mt_rand(rd());
476  #else
477  std::mt19937 mt_rand(rd());
478  #endif
479  std::shuffle(bit_list, bit_list + bit_list_size, mt_rand);
480  for (unsigned i = 0; i < count; ++i)
481  {
482  bm::set_bit(blk_out, bit_list[i]);
483  }
484 }
485 
486 } // namespace
487 
488 #include "bmundef.h"
489 
490 #endif
bm::bvector::rs_index_type
rs_index< allocator_type > rs_index_type
Definition: bm.h:845
bmfunc.h
Bit manipulation primitives (internal)
bm::random_subset::bvector_type
BV bvector_type
Definition: bmrandom.h:59
BM_IS_GAP
#define BM_IS_GAP(ptr)
Definition: bmdef.h:181
bm::set_block_size
const unsigned set_block_size
Definition: bmconst.h:54
bm::gap_convert_to_arr
D gap_convert_to_arr(D *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned dest_len, bool invert=false) BMNOEXCEPT
Convert gap block into array of ints corresponding to 1 bits.
Definition: bmfunc.h:4340
BMGAP_PTR
#define BMGAP_PTR(ptr)
Definition: bmdef.h:179
bmundef.h
pre-processor un-defines to avoid global space pollution (internal)
bm::get_block_coord
BMFORCEINLINE void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) BMNOEXCEPT
Recalc linear bvector block index into 2D matrix coordinates.
Definition: bmfunc.h:148
bm::random_subset::sample
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition: bmrandom.h:140
BMNOEXCEPT
#define BMNOEXCEPT
Definition: bmdef.h:79
bm::gap_word_t
unsigned short gap_word_t
Definition: bmconst.h:77
bm::gap_convert_to_bitset
void gap_convert_to_bitset(unsigned *BMRESTRICT dest, const T *BMRESTRICT buf) BMNOEXCEPT
GAP block to bitblock conversion.
Definition: bmfunc.h:3879
bm::bit_list
unsigned bit_list(T w, B *bits) BMNOEXCEPT
Unpacks word into list of ON bit indexes.
Definition: bmfunc.h:440
BM_ASSERT
#define BM_ASSERT
Definition: bmdef.h:130
bm::random_subset
Definition: bmrandom.h:56
bm::bit_block_convert_to_arr
unsigned bit_block_convert_to_arr(T *BMRESTRICT dest, const unsigned *BMRESTRICT src, bool inverted) BMNOEXCEPT
Convert bit block into an array of ints corresponding to 1 bits.
Definition: bmfunc.h:7924
bm::random_subset::random_subset
random_subset()
Definition: bmrandom.h:128
bmdef.h
Definitions(internal)
bm::gap_max_bits
const unsigned gap_max_bits
Definition: bmconst.h:80
bm::word_bitcount
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition: bmfunc.h:199
bm::random_subset::~random_subset
~random_subset()
Definition: bmrandom.h:134
bm
Definition: bm.h:76
bm::bit_block_copy
void bit_block_copy(bm::word_t *BMRESTRICT dst, const bm::word_t *BMRESTRICT src) BMNOEXCEPT
Bitblock copy operation.
Definition: bmfunc.h:5994
bm::word_t
unsigned int word_t
Definition: bmconst.h:38
bm::bitscan_popcnt
unsigned short bitscan_popcnt(bm::id_t w, B *bits, unsigned short offs) BMNOEXCEPT
Unpacks word into list of ON bit indexes using popcnt method.
Definition: bmfunc.h:477
bm::random_subset::size_type
BV::size_type size_type
Definition: bmrandom.h:60
bm::set_bit
BMFORCEINLINE void set_bit(unsigned *dest, unsigned bitpos) BMNOEXCEPT
Set 1 bit in a block.
Definition: bmfunc.h:3145
bm::bit_block_set
void bit_block_set(bm::word_t *BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
Bitblock memset operation.
Definition: bmfunc.h:3861