BitMagic-C++
bmxor.h
Go to the documentation of this file.
1#ifndef BMXORFUNC__H__INCLUDED__
2#define BMXORFUNC__H__INCLUDED__
3/*
4Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmxor.h
22 \brief Functions and utilities for XOR filters (internal)
23*/
24
25#include "bmdef.h"
26#include "bmutil.h"
27
28
29namespace bm
30{
31
32/**
33 Parameters for XOR similarity search.
34 Tuneup params allows to reduce the search space to better
35 balance compression rate vs speed.
36
37 min_lookup_depth - default: 0, minimal search depth to still try to improve
38 max_lookup_depth - default: (2billion) maximum scan search
39 Set a smaller number to improve search time
40
41 Example:
42 min_lookup_depth = 64; max_lookup_depth = 1024;
43
44 stop_gain - default: 65533 - cutoff to stop search when serch found gain
45 better than "stop_gain" absolute value
46 Example:
47 stop_gain = 10000;
48
49 target_gain_ratio - default: 0.89 - cut off ratio between original block cost
50 metric and XOR basedd metric to stop searching.
51 (90% better than the original block is "good enough")
52 Example:
53 target_gain_ratio = 0.50; // 50% improvement is good enough
54
55 min_gaps - default: 3 - minimal size of GAP block to be considered for
56 XOR search candidate
57 */
59{
62 unsigned stop_gain;
64 unsigned min_gaps;
65
68 max_lookup_depth(~0u/2),
70 target_gain_ratio(0.89f),
71 min_gaps(3)
72 {}
73};
74
75
76/**
77 XOR complementarity type between 2 blocks
78 @internal
79 */
81{
87};
88
89/*!
90 Function (32-bit) calculates basic complexity statistics on XOR product of
91 two blocks (b1 XOR b2)
92 @ingroup bitfunc
93 @internal
94*/
95inline
97 const bm::word_t* BMRESTRICT xor_block,
98 unsigned size,
99 unsigned* BMRESTRICT gc,
100 unsigned* BMRESTRICT bc) BMNOEXCEPT
101{
102 BM_ASSERT(gc && bc);
103
104 unsigned gap_count = 1;
105 unsigned bit_count = 0;
106
107 bm::word_t w, w0, w_prev, w_l;
108 w = w0 = *block ^ *xor_block;
109 bit_count += word_bitcount(w);
110
111 const int w_shift = int(sizeof(w) * 8 - 1);
112 w ^= (w >> 1);
113 gap_count += bm::word_bitcount(w);
114 gap_count -= (w_prev = (w0 >> w_shift)); // negative value correction
115
116 const bm::word_t* block_end = block + size;
117 for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
118 {
119 w = w0 = *block ^ *xor_block;
120 bit_count += bm::word_bitcount(w);
121 ++gap_count;
122 if (!w)
123 {
124 gap_count -= !w_prev;
125 w_prev = 0;
126 }
127 else
128 {
129 w ^= (w >> 1);
130 gap_count += bm::word_bitcount(w);
131
132 w_l = w0 & 1;
133 gap_count -= (w0 >> w_shift); // negative value correction
134 gap_count -= !(w_prev ^ w_l); // word border correction
135
136 w_prev = (w0 >> w_shift);
137 }
138 } // for
139
140 *gc = gap_count;
141 *bc = bit_count;
142}
143
144/*!
145 Function (64-bit) calculates basic complexity statistics on XOR product of
146 two blocks (b1 XOR b2)
147 @ingroup bitfunc
148 @internal
149*/
150inline
152 const bm::word_t* BMRESTRICT ref_block,
153 unsigned size,
154 unsigned* BMRESTRICT gc,
155 unsigned* BMRESTRICT bc) BMNOEXCEPT
156{
157 BM_ASSERT(gc && bc);
158
159 unsigned gap_count = 1;
160 unsigned bit_count = 0;
161
162 const bm::id64_t* BMRESTRICT block = (const bm::id64_t*) s_block;
163 const bm::id64_t* BMRESTRICT xor_block = (const bm::id64_t*) ref_block;
164
165 bm::id64_t w, w0, w_prev, w_l;
166 w = w0 = *block ^ *xor_block;
167 bit_count += word_bitcount64(w);
168
169 const int w_shift = int(sizeof(w) * 8 - 1);
170 w ^= (w >> 1);
171 gap_count += bm::word_bitcount64(w);
172 gap_count -= unsigned(w_prev = (w0 >> w_shift)); // negative value correction
173
174 const bm::id64_t* block_end = block + (size/2);
175 for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
176 {
177 w = w0 = *block ^ *xor_block;
178 bit_count += bm::word_bitcount64(w);
179 ++gap_count;
180 if (!w)
181 {
182 gap_count -= !w_prev;
183 w_prev = 0;
184 }
185 else
186 {
187 w ^= (w >> 1);
188 gap_count += bm::word_bitcount64(w);
189 w_l = w0 & 1;
190 gap_count -= unsigned(w0 >> w_shift); // negative value correction
191 gap_count -= !(w_prev ^ w_l); // word border correction
192 w_prev = (w0 >> w_shift);
193 }
194 } // for
195
196 *gc = gap_count;
197 *bc = bit_count;
198}
199
200
201
202/*!
203 Function calculates number of times when bit value changed
204 @internal
205*/
206inline
208 const bm::word_t* BMRESTRICT xor_block,
209 unsigned size,
210 unsigned* BMRESTRICT gc,
211 unsigned* BMRESTRICT bc) BMNOEXCEPT
212{
213#ifdef VECT_BLOCK_XOR_CHANGE
214 VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc);
215#else
216 #ifdef BM64OPT
217 bm::bit_block_xor_change64(block, xor_block, size, gc, bc);
218 #else
219 bm::bit_block_xor_change32(block, xor_block, size, gc, bc);
220 #endif
221#endif
222}
223
224/**
225 Structure to compute XOR gap-count profile by sub-block waves
226 @ingroup bitfunc
227 @internal
228*/
230{
231 // stats for s-block
232 unsigned short sb_gc[bm::block_waves]; ///< GAP counts
233 unsigned short sb_bc[bm::block_waves]; ///< BIT counts
234
235 // stats ref.block XOR mask
236 unsigned short sb_xor_gc[bm::block_waves]; ///< XOR-mask GAP count
237 unsigned short sb_xor_bc[bm::block_waves]; ///< XOR-mask GAP count
238};
239
240/**
241 Capture the XOR filter results (xor block against ref.block)
242 @internal
243 */
245{
247
249 unsigned block_gain; ///< XOR filter improvement (best)
250 size_type ref_idx; ///< reference vector index
251 bm::id64_t xor_d64; ///< recorded digest
252
253 // sub-block waves masks for metrics
257
258 // recorded gains for metrics
259 unsigned gc_gain;
260 unsigned bc_gain;
261 unsigned ibc_gain;
262
263 unsigned xor_gc;
264 unsigned xor_bc;
265
267};
268
269/**
270 XOR match pair
271 @internal
272 */
274{
275 bvector_size_type ref_idx; ///< reference vector index
276 bm::id64_t xor_d64; ///< recorded digest
277
280 : ref_idx(idx), xor_d64(d64)
281 {}
282
283};
284
285/**
286 XOR match chain
287 @internal
288 */
289template<typename BLOCK_IDX> struct block_match_chain
290{
291 BLOCK_IDX nb;
292 unsigned chain_size;
293 unsigned ref_idx[64];
296
298 {
299 if (nb != bmc.nb || chain_size != bmc.chain_size || match != bmc.match)
300 {
301 return false;
302 }
303 for (unsigned i = 0; i < chain_size; ++i)
304 {
305 if (ref_idx[i] != bmc.ref_idx[i])
306 return false;
307 if (match == e_xor_match_EQ)
308 continue;
309 if (xor_d64[i] != bmc.xor_d64[i])
310 return false;
311 }
312 return true;
313 }
314};
315
316/**
317 Greedy algorithm to find additional matches
318 improving the inital best match block on its match type
319
320 @param match_pairs_vect - [out] target vector of best match pairs
321 @param match_vect - [in/out] vector of all found match descriptors
322
323 @return number of new finds (if any)
324 */
325template<typename PVT, typename VT>
326typename VT::size_type
327greedy_refine_match_vector(PVT& match_pairs_vect,
328 VT& match_vect,
329 typename VT::size_type best_ref_idx,
330 bm::id64_t d64,
331 bm::xor_complement_match match_type)
332{
333 BM_ASSERT(match_type && d64);
334 match_pairs_vect.resize(0);
335
336 bm::id64_t d64_acc(d64);
337
338 // pass 1 (exact match)
339 //
340 typename VT::size_type sz = match_vect.size();
341 for (typename VT::size_type i = 0; (i < sz) && (d64_acc != ~0ull); ++i)
342 {
343 block_xor_match_descr& xmd = match_vect[i];
344 if (xmd.ref_idx == best_ref_idx) // self hit
345 continue;
346 if (xmd.match_type == match_type) // best compatible match types
347 {
348 bm::id64_t d64_new = ~~d64_acc & xmd.xor_d64;
349 if (d64_new)
350 {
351 d64_acc |= d64_new; // add mask to accum
352 match_pairs_vect.push_back(bm::match_pair(xmd.ref_idx, d64_new));
353 }
354 xmd.match_type = e_no_xor_match; // mark it to exclude from pass 2
355 }
356 } // for i
357
358 // pass 2 (extended match)
359 //
360 const unsigned min_gain_cut_off = 50;
361 for (typename VT::size_type i = 0; (i < sz) && (d64_acc != ~0ull); ++i)
362 {
363 block_xor_match_descr& xmd = match_vect[i];
364 if (xmd.ref_idx == best_ref_idx || !xmd.match_type) // self hit or none
365 continue;
366 BM_ASSERT(xmd.match_type != match_type);
367
368 bm::id64_t d64_new = 0;
369 switch (match_type)
370 {
371 case e_xor_match_GC:
372 if (xmd.gc_gain > min_gain_cut_off)
373 d64_new = ~~d64_acc & xmd.gc_d64;
374 break;
375 case e_xor_match_BC:
376 if (xmd.bc_gain > min_gain_cut_off)
377 d64_new = ~~d64_acc & xmd.bc_d64;
378 break;
379 case e_xor_match_iBC:
380 if (xmd.ibc_gain > min_gain_cut_off)
381 d64_new = ~~d64_acc & xmd.ibc_d64;
382 break;
383 default:
384 break;
385 } // switch
386
387 if (d64_new) // some improvement found
388 {
389 d64_acc |= d64_new; // add mask to accum
390 match_pairs_vect.push_back(bm::match_pair(xmd.ref_idx, d64_new));
392 }
393 } // for
394
395 return match_pairs_vect.size();
396}
397
398/**
399 Check effective bit-rate for the XOR encode vector
400 @return 1 - < 256 (8bit), 2 - < 65536 (16-bit) or 0 - 32-bit
401 @internal
402 */
403template<typename BMChain, typename RVect>
404unsigned char check_pair_vect_vbr(const BMChain& mchain, const RVect& ref_vect)
405{
406 size_t max_idx = 0;
407 for (size_t i = 0; i < mchain.chain_size; ++i)
408 {
409 bvector_size_type ridx = mchain.ref_idx[i];
410 ridx = ref_vect.get_row_idx(ridx);
411 if (ridx > max_idx)
412 max_idx = ridx;
413 } // for i
414 if (max_idx < 256)
415 return 1;
416 if (max_idx < 65536)
417 return 2;
418 return 0;
419}
420
421
422/**
423 Compute reference (non-XOR) 64-dim complexity descriptor for the
424 s-block.
425 Phase 1 of the XOR filtering process is to establish the base metric
426
427 @internal
428*/
429inline
432 unsigned* BMRESTRICT s_gc,
433 unsigned* BMRESTRICT s_bc) BMNOEXCEPT
434{
435 *s_gc = *s_bc = 0;
436 // TODO: SIMD (for loop can go inside VECT to minimize LUT re-inits)
437 for (unsigned i = 0; i < bm::block_waves; ++i)
438 {
439 unsigned off = (i * bm::set_block_digest_wave_size);
440 const bm::word_t* sub_block = block + off;
441 unsigned gc, bc;
442 // TODO: optimize to compute GC and BC in a single pass
443 #if defined(VECT_BLOCK_CHANGE)
445 #else
446 #ifdef BM64OPT
449 #else
451 #endif
452 #endif
454 sub_block, sub_block + bm::set_block_digest_wave_size);
455 x_descr.sb_bc[i] = (unsigned short) bc;
456 *s_bc += bc;
457 if (i) // wave border correction
458 {
459 bm::word_t w_l = sub_block[-1];
460 bm::word_t w_r = sub_block[0] & 1;
461 w_l >>= 31;
462 gc -= (w_l == w_r);
463 }
464 x_descr.sb_gc[i] = (unsigned short) gc;
465 *s_gc += gc;
466
467 } // for i
468 // TODO: compute and return d64 - use it later
469}
470
471
472/**
473 Build partial XOR product of 2 bit-blocks using digest mask
474
475 @param target_block - target := block ^ xor_block
476 @param block - arg1
477 @param xor_block - arg2
478 @param digest - mask for each block wave to XOR (1) or just copy (0)
479
480 @internal
481*/
482inline
483void bit_block_xor(bm::word_t* target_block,
484 const bm::word_t* block,
485 const bm::word_t* xor_block,
486 bm::id64_t digest) BMNOEXCEPT
487{
488 BM_ASSERT(target_block);
489 BM_ASSERT(block);
490 BM_ASSERT(xor_block);
491 BM_ASSERT(digest);
492
493#ifdef VECT_BIT_BLOCK_XOR
494 VECT_BIT_BLOCK_XOR(target_block, block, xor_block, digest);
495#else
496 for (unsigned i = 0; i < bm::block_waves; ++i)
497 {
498 const bm::id64_t mask = (1ull << i);
499 unsigned off = (i * bm::set_block_digest_wave_size);
500
501 #ifdef BM64OPT
502 const bm::id64_t* sub_block = (const bm::id64_t*)(block + off);
503 bm::id64_t* t_sub_block = (bm::id64_t*)(target_block + off);
504 const bm::id64_t* sub_block_end = sub_block + bm::set_block_digest_wave_size/2;
505 if (digest & mask) // XOR filtered sub-block
506 {
507 const bm::id64_t* xor_sub_block = (const bm::id64_t*)(xor_block + off);
508 for ( ;sub_block < sub_block_end; )
509 {
510 t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
511 t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
512 t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
513 t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
514 t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
515 } // for
516 }
517 else // just copy the source
518 {
519 for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
520 {
521 t_sub_block[0] = sub_block[0];
522 t_sub_block[1] = sub_block[1];
523 t_sub_block[2] = sub_block[2];
524 t_sub_block[3] = sub_block[3];
525 } // for
526 }
527 #else
528 const bm::word_t* sub_block = block + off;
529 bm::word_t* t_sub_block = target_block + off;
530 const bm::word_t* sub_block_end = sub_block + bm::set_block_digest_wave_size;
531 if (digest & mask) // XOR filtered sub-block
532 {
533 const bm::word_t* xor_sub_block = xor_block + off;
534 for (; sub_block < sub_block_end; )
535 {
536 t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
537 t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
538 t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
539 t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
540 t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
541 } // for
542 }
543 else // just copy the source
544 {
545 for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
546 {
547 t_sub_block[0] = sub_block[0];
548 t_sub_block[1] = sub_block[1];
549 t_sub_block[2] = sub_block[2];
550 t_sub_block[3] = sub_block[3];
551 } // for
552 }
553 #endif
554 } // for i
555#endif
556}
557
558
559/**
560 Build partial XOR product of 2 bit-blocks using digest mask
561
562 @param target_block - target := target ^ xor_block
563 @param xor_block - arg
564 @param digest - mask for each block wave to XOR (1)
565
566 @internal
567*/
568inline
569void bit_block_xor(bm::word_t* target_block, const bm::word_t* xor_block,
570 bm::id64_t digest) BMNOEXCEPT
571{
572 BM_ASSERT(target_block);
573 BM_ASSERT(xor_block);
574 BM_ASSERT(digest);
575
576 #ifdef VECT_BIT_BLOCK_XOR_2WAY
577 VECT_BIT_BLOCK_XOR_2WAY(target_block, xor_block, digest);
578 #else
579 while (digest)
580 {
581 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
582 unsigned wave = bm::word_bitcount64(t - 1);
583 unsigned off = wave * bm::set_block_digest_wave_size;
584
585 #ifdef BM64OPT
586 const bm::id64_t* xor_sub_block = (const bm::id64_t*)(xor_block + off);
587 bm::id64_t* t_sub_block = (bm::id64_t*)(target_block + off);
588 const bm::id64_t* t_sub_block_end = (t_sub_block + bm::set_block_digest_wave_size/2);
589 for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
590 {
591 t_sub_block[0] ^= xor_sub_block[0];
592 t_sub_block[1] ^= xor_sub_block[1];
593 t_sub_block[2] ^= xor_sub_block[2];
594 t_sub_block[3] ^= xor_sub_block[3];
595 } // for
596 #else
597 const bm::word_t* xor_sub_block = xor_block + off;
598 bm::word_t* t_sub_block = target_block + off;
599 const bm::word_t* t_sub_block_end = t_sub_block + bm::set_block_digest_wave_size;
600 for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
601 {
602 t_sub_block[0] ^= xor_sub_block[0];
603 t_sub_block[1] ^= xor_sub_block[1];
604 t_sub_block[2] ^= xor_sub_block[2];
605 t_sub_block[3] ^= xor_sub_block[3];
606 } // for
607 #endif
608 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
609 } // while
610 #endif
611}
612
613/**
614 List of reference bit-vectors with their true index associations
615
616 Each referece vector would have two alternative indexes:
617 - index(position) in the reference list
618 - index(row) in the external bit-matrix (plane index)
619
620 @internal
621*/
622template<typename BV>
624{
625public:
626 typedef BV bvector_type;
631
632
634 typedef
635 bm::dynamic_heap_matrix<block_match_chain_type, bv_allocator_type>
637
638public:
639
640 /// reset the collection (resize(0))
641 void reset()
642 {
643 rows_acc_ = 0;
644 ref_bvects_.resize(0); ref_bvects_rows_.resize(0);
645 }
646
647 /**
648 Add reference vector
649 @param bv - bvector pointer
650 @param ref_idx - reference (row) index
651 */
652 void add(const bvector_type* bv, size_type ref_idx)
653 {
654 BM_ASSERT(bv);
655 ref_bvects_.push_back(bv);
656 ref_bvects_rows_.push_back(ref_idx);
657 }
658
659 /// Get reference list size
660 size_type size() const BMNOEXCEPT { return (size_type)ref_bvects_.size(); }
661
662 /// Get reference vector by the index in this ref-vector
664 { return ref_bvects_[idx]; }
665
666 /// Get reference row index by the index in this ref-vector
668 { return (size_type)ref_bvects_rows_[idx]; }
669
670 /// not-found value for find methods
671 static
673
674 /// Find vector index by the reference index
675 /// @return ~0 if not found
676 size_type find(std::size_t ref_idx) const BMNOEXCEPT
677 {
678 size_type sz = size();
679 for (size_type i = 0; i < sz; ++i) // TODO: optimization
680 if (ref_idx == ref_bvects_rows_[i])
681 return i;
682 return not_found();
683 }
684
685 /// Find vector index by the pointer
686 /// @return ~0 if not found
688 {
689 size_type sz = size();
690 for (size_type i = 0; i < sz; ++i)
691 if (bv == ref_bvects_[i])
692 return i;
693 return not_found();
694 }
695
696 /// Fill block allocation digest for all vectors in the reference collection
697 /// @param bv_blocks - [out] bvector of blocks statistics
698 ///
699 void fill_alloc_digest(bvector_type& bv_blocks) const
700 {
701 size_type sz = size();
702 if (sz)
703 {
704 for (size_type i = 0; i < sz; ++i)
705 ref_bvects_[i]->fill_alloc_digest(bv_blocks);
707 bv_blocks.optimize(tb);
708 }
709 }
710
711 /// Reset and build vector of references from a basic bit-matrix
712 /// all NULL rows are skipped, not added to the ref.vector
713 /// @sa add_vectors
714 ///
715 template<class BMATR>
716 void build(const BMATR& bmatr)
717 {
718 reset();
719 add_vectors(bmatr);
720 }
721
722 /// Append basic bit-matrix to the list of reference vectors
723 /// @sa build
724 /// @sa add_sparse_vector
725 template<typename BMATR>
726 void add_vectors(const BMATR& bmatr)
727 {
728 size_type rows = bmatr.rows();
729 for (size_type r = 0; r < rows; ++r)
730 if (bvector_type_const_ptr bv = bmatr.get_row(r))
731 add(bv, rows_acc_ + r);
732 rows_acc_ += unsigned(rows);
733 }
734
735 /// Add bit-transposed sparse vector as a bit-matrix
736 /// @sa add_vectors
737 ///
738 template<class SV>
739 void add_sparse_vector(const SV& sv)
740 {
741 add_vectors(sv.get_bmatrix());
742 }
743
744 /** Utility function to resize matrix based on number of vectors and blocks
745 */
747 size_type total_blocks) const
748 {
749 if (total_blocks)
750 matr.resize(ref_bvects_.size(), total_blocks, false /*no-copy*/);
751 else
752 matr.resize(0, 0);
753 }
754
755 /** Calculate blocks digest and resize XOR distance matrix
756 based on total number of available blocks
757 @return true if created ok (false if no blocks found)
758 */
760 bvector_type& bv_blocks) const
761 {
762 fill_alloc_digest(bv_blocks);
763 size_type cnt = bv_blocks.count();
764 if (!cnt)
765 return false;
766 resize_xor_matrix(matr, cnt);
767 return true;
768 }
769
770protected:
771 typedef bm::heap_vector<bvector_type_const_ptr, bv_allocator_type, true> bvptr_vector_type;
772 typedef bm::heap_vector<std::size_t, bv_allocator_type, true> bv_plane_vector_type;
773
774protected:
775 unsigned rows_acc_ = 0; ///< total rows accumulator
776 bvptr_vector_type ref_bvects_; ///< reference vector pointers
777 bv_plane_vector_type ref_bvects_rows_; ///< reference vector row idxs
778};
779
780// --------------------------------------------------------------------------
781//
782// --------------------------------------------------------------------------
783
784/**
785 XOR similarity model
786
787 @internal
788*/
789template<typename BV>
791{
792public:
793 typedef BV bvector_type;
796
797
799 typedef
800 bm::dynamic_heap_matrix<block_match_chain_type, bv_allocator_type>
802
803public:
804 matrix_chain_type matr; ///< model matrix
805 bvector_type bv_blocks; ///< blocks digest
806};
807
808// --------------------------------------------------------------------------
809//
810// --------------------------------------------------------------------------
811
812/**
813 XOR scanner to search for complement-similarities in
814 collections of bit-vectors
815
816 @internal
817*/
818template<typename BV>
820{
821public:
823 typedef BV bvector_type;
826
827 typedef bm::heap_vector<bm::block_xor_match_descr, bv_allocator_type, true>
829 typedef bm::heap_vector<bm::match_pair, bv_allocator_type, true>
833 typedef bm::heap_vector<bm::word_t*, bv_allocator_type, true>
835 typedef bm::heap_vector<unsigned, bv_allocator_type, true>
837
838 typedef bm::heap_vector<bm::block_waves_xor_descr, bv_allocator_type, true>
840
841public:
842
845 {
846 free_blocks();
847 }
848
849
851 { ref_vect_ = ref_vect; }
852
854 { return *ref_vect_; }
855
856 /** Get statistics for the r-(or s-) block
857 @param ri - nb cache index
858 */
860
861 /** Compute statistics for the r-(or s-) block
862 @param block - bit-block target
863 */
865
866 /** Scan for all candidate bit-blocks to find mask or match
867 @return XOR referenece match type
868 */
870 search_best_xor_mask(const bm::word_t* s_block,
871 size_type ri,
872 size_type ridx_from,
873 size_type ridx_to,
874 unsigned i, unsigned j,
875 bm::word_t* tx_block,
876 const bm::xor_sim_params& params);
877 /**
878 Run a search to add possible XOR match chain additions
879 */
881
882
883 /**
884 XOR all match blocks to target using their digest masks
885 */
887 bm::word_t* target_xor_block,
888 const bm::word_t* s_block,
889 size_type s_ri,
890 const match_pairs_vector_type& pm_vect,
891 unsigned i, unsigned j) const BMNOEXCEPT;
892
893 /**
894 Calculate matrix of best XOR match metrics per block
895 for the attached collection of bit-vectors
896 @return true if computed successfully
897 */
898 bool compute_sim_model(xor_sim_model<BV>& sim_model,
899 const bv_ref_vector_type& ref_vect,
900 const bm::xor_sim_params& params);
901
902 /**
903 Calculate matrix of best XOR match metrics per block
904 for the attached collection of bit-vectors
905 @return true if computed successfully
906 */
907 bool compute_sim_model(xor_sim_model<BV> &sim_model,
908 const bm::xor_sim_params& params);
909
910 /**
911 Compute similarity model for block
912 */
914 typename xor_sim_model<BV>::matrix_chain_type &sim_model_matr,
915 size_type nb,
916 size_type nb_rank,
917 const bm::xor_sim_params& params);
918 /**
919 Compute reference complexity descriptor based on XOR vector.
920 Returns the digest of sub-blocks where XOR filtering improved the metric
921 (function needs reference to estimate the improvement).
922
923 part of Phase 2 of the XOR filtering process
924
925 @sa compute_sub_block_complexity_descr
926
927 @internal
928 */
930 const bm::word_t* BMRESTRICT block,
931 bm::id64_t block_d64,
932 const bm::word_t* BMRESTRICT xor_block,
935
936 /**
937 Check if XOR transform simplified block enough for
938 compressibility objective
939 */
940 bool validate_xor(const bm::word_t* xor_block) const BMNOEXCEPT;
941
942 size_type found_ridx() const BMNOEXCEPT { return found_ridx_; }
943
944 /// Return best match type of a found block
945 ///
947 { return x_block_mtype_; }
948
950 { return found_block_xor_; }
951 unsigned get_x_best_metric() const BMNOEXCEPT { return x_best_metric_; }
952 bm::id64_t get_xor_digest() const BMNOEXCEPT { return x_d64_; }
953
954 unsigned get_s_bc() const BMNOEXCEPT { return s_bc_; }
955 unsigned get_s_gc() const BMNOEXCEPT { return s_gc_; }
957 { return s_block_best_metric_; }
958
959
961
962 static
963 bm::xor_complement_match best_metric(unsigned bc, unsigned gc,
964 unsigned* best_metric) BMNOEXCEPT;
965
967 { return match_vect_; }
968
970 { return chain_match_vect_; }
971
972 /// Return block from the reference vector [vect_idx, block_i, block_j]
973 ///
975 unsigned i, unsigned j) const BMNOEXCEPT
976 { return ref_vect_->get_bv(ri)->get_blocks_manager().get_block_ptr(i, j); }
977
978 /// Sync TEMP vector size
979 /// @internal
980 void sync_nb_vect();
981
982protected:
983
984 /// Deoptimize vertical slice of GAP blocks
985 /// @param nb - block number
986 ///
988 const xor_sim_params& params);
989
990 /// Free the collection of temp blocks
991 void free_blocks() BMNOEXCEPT;
992
993
994private:
995 xor_scanner(const xor_scanner&) = delete;
996 xor_scanner& operator=(const xor_scanner&) = delete;
997
998private:
999 BM_DECLARE_TEMP_BLOCK(xor_tmp_block_)
1000
1001 const bv_ref_vector_type* ref_vect_ = 0; ///< ref.vect for XOR filter
1002 bv_blocks_vector_type nb_blocks_vect_; ///< pointers to temp blocks
1003 bv_bcgc_vector_type nb_gc_vect_;
1004 bv_bcgc_vector_type nb_bc_vect_;
1005 bv_xdescr_vector_type nb_xdescr_vect_;
1006
1007 bv_allocator_type alloc_; ///< allocator to produce blocks
1008
1009 bm::block_waves_xor_descr x_descr_; ///< XOR desriptor
1010
1011 // S-block statistics
1012 //
1013 unsigned s_bc_; ///< bitcount
1014 unsigned s_gc_; ///< gap count
1015 unsigned s_block_best_metric_; ///< s-block orig.metric
1016
1017 unsigned x_best_metric_; ///< min(gc, bc, ibc)
1018 bm::xor_complement_match x_block_mtype_; ///< metric type
1019
1020 // scan related metrics
1021 bm::id64_t x_d64_; ///< search digest
1022 size_type found_ridx_; ///< match vector (in references)
1023 const bm::word_t* found_block_xor_;
1024
1025 // match chain members:
1026 //
1027 xor_matches_vector_type match_vect_; ///< vector of match descr
1028 match_pairs_vector_type chain_match_vect_; ///< refined match pairs
1029};
1030
1031// --------------------------------------------------------------------------
1032//
1033// --------------------------------------------------------------------------
1034//
1035// Naming conventions and glossary:
1036//
1037// s_block - source serialization block to be XOR filtered (anchor block)
1038// best_ref_block - best reference block picked for XOR transform
1039// xt_block - s_block XOR best_ref_block
1040//
1041
1042
1043
1044template<typename BV>
1046{
1047 x_descr_ = nb_xdescr_vect_[ri];
1048 s_gc_ = nb_gc_vect_[ri];
1049 s_bc_ = nb_bc_vect_[ri];
1050
1051 x_block_mtype_ = best_metric(s_bc_, s_gc_, &s_block_best_metric_);
1052 x_best_metric_ = s_block_best_metric_;
1053}
1054
1055
1056template<typename BV>
1058{
1059 BM_ASSERT(IS_VALID_ADDR(block));
1060 BM_ASSERT(!BM_IS_GAP(block));
1061
1062 bm::compute_s_block_descr(block, x_descr_, &s_gc_, &s_bc_);
1063
1064 x_block_mtype_ = best_metric(s_bc_, s_gc_, &s_block_best_metric_);
1065 x_best_metric_ = s_block_best_metric_;
1066}
1067// --------------------------------------------------------------------------
1068
1069template<typename BV>
1071 const bm::word_t* BMRESTRICT block,
1072 bm::id64_t block_d64,
1073 const bm::word_t* BMRESTRICT xor_block,
1076{
1077 bm::id64_t d0 = ~~block_d64;
1078
1079 xmd.xor_gc = xmd.xor_bc = 0;
1080
1081 // Pass 1: compute XOR descriptors
1082 //
1083 for (unsigned i = 0; i < bm::block_waves; ++i)
1084 {
1085 unsigned off = (i * bm::set_block_digest_wave_size);
1086 const bm::word_t* sub_block = block + off;
1087 const bm::word_t* xor_sub_block = xor_block + off;
1088
1089 unsigned xor_gc, xor_bc;
1090 bm::bit_block_xor_change(sub_block, xor_sub_block,
1092 &xor_gc, &xor_bc);
1093 x_descr.sb_xor_bc[i] = (unsigned short)xor_bc;
1094 BM_ASSERT(xor_gc);
1095 if (i) // wave border correction
1096 {
1097 bm::word_t w_l = (sub_block[-1] ^ xor_sub_block[-1]);
1098 bm::word_t w_r = (sub_block[0] ^ xor_sub_block[0]) & 1;
1099 w_l >>= 31;
1100 xor_gc -= (w_l == w_r);
1101 }
1102 x_descr.sb_xor_gc[i] = (unsigned short)xor_gc;
1103
1104 xmd.xor_bc += xor_bc;
1105 xmd.xor_gc += xor_gc;
1106
1107 } // for i
1108
1109 // Pass 2: find the best match
1110 //
1111 unsigned block_gc_gain(0), block_bc_gain(0), block_ibc_gain(0);
1112 bm::id64_t gc_digest(0), bc_digest(0), ibc_digest(0);
1113 const unsigned wave_max_bits = bm::set_block_digest_wave_size * 32;
1114
1115 for (unsigned i = 0; i < bm::block_waves; ++i)
1116 {
1117 bm::id64_t dmask = (1ull << i);
1118 if (d0 & dmask)
1119 continue;
1120
1121 unsigned xor_gc = x_descr.sb_xor_gc[i];
1122 if (xor_gc <= 1)
1123 {
1124 gc_digest |= dmask;
1125 block_gc_gain += x_descr.sb_gc[i]; // all previous GAPs are gone
1126 }
1127 else if (xor_gc < x_descr.sb_gc[i]) // some improvement in GAPs
1128 {
1129 gc_digest |= dmask;
1130 block_gc_gain += (x_descr.sb_gc[i] - xor_gc);
1131 }
1132 unsigned xor_bc = x_descr.sb_xor_bc[i];
1133 if (xor_bc < x_descr.sb_bc[i]) // improvement in BITS
1134 {
1135 bc_digest |= dmask;
1136 block_bc_gain += (x_descr.sb_bc[i] - xor_bc);
1137 }
1138 unsigned xor_ibc = wave_max_bits - xor_bc;
1139 unsigned wave_ibc = wave_max_bits - x_descr.sb_bc[i];
1140 if (xor_ibc < wave_ibc) // improvement in 0 BITS
1141 {
1142 ibc_digest |= dmask;
1143 block_ibc_gain += (wave_ibc - xor_ibc);
1144 }
1145
1146 } // for i
1147
1148 // Save metric results into XOR match descriptor
1149 //
1150
1151 xmd.gc_d64 = gc_digest;
1152 xmd.bc_d64 = bc_digest;
1153 xmd.ibc_d64 = ibc_digest;
1154
1155 xmd.gc_gain = block_gc_gain;
1156 xmd.bc_gain = block_bc_gain;
1157 xmd.ibc_gain = block_ibc_gain;
1158
1159
1160 // Find the winning metric and its digest mask
1161 //
1162 if (!(block_gc_gain | block_bc_gain | block_ibc_gain)) // match not found
1163 {
1164 // this is to check if xor filter has
1165 // canceled out a whole sub-block wave
1166 //
1167 bm::id64_t d0_x = ~~bm::calc_block_digest0(xor_block);
1168 if (d0 == d0_x)
1169 {
1170 xmd.match_type = bm::e_xor_match_GC;
1171 xmd.block_gain = bm::block_waves;
1172 xmd.xor_d64 = d0;
1173 return;
1174 }
1175 xmd.match_type = bm::e_no_xor_match;
1176 xmd.block_gain = 0; xmd.xor_d64 = 0;
1177 return;
1178 }
1179
1180 int new_gc = int(s_gc_) - int(block_gc_gain);
1181 if (new_gc < 0)
1182 new_gc = 0;
1183 int new_bc = int(s_bc_) - int(block_bc_gain);
1184 if (new_bc < 0)
1185 new_bc = 0;
1186 int new_ibc = int(bm::gap_max_bits) - int(s_bc_) - int(block_ibc_gain);
1187 if (new_ibc < 0)
1188 new_ibc = 0;
1189
1190
1191 unsigned best_m;
1192 xmd.match_type = best_metric(unsigned(new_bc), unsigned(new_gc), &best_m);
1193 switch (xmd.match_type)
1194 {
1195 case e_xor_match_GC:
1196 if (new_ibc < new_gc)
1197 {
1198 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1199 }
1200 else
1201 {
1202 xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1203 }
1204 break;
1205 case e_xor_match_BC:
1206 if (new_ibc < new_bc)
1207 {
1208 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1209 }
1210 else
1211 {
1212 xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1213 }
1214 break;
1215 case e_xor_match_iBC:
1216 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1217 break;
1218 default:
1219 break;
1220 } // switch
1221#if 0
1222 // Disabled as cases compression ratio degradation in some tests
1223 if (!xmd.xor_d64) // best metric choice did not work try best gain
1224 {
1225 if (block_gc_gain >= block_bc_gain && block_gc_gain >= block_ibc_gain)
1226 {
1227 xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1228 }
1229 else
1230 if (block_bc_gain > block_gc_gain && block_bc_gain > block_ibc_gain)
1231 {
1232 xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1233 }
1234 else
1235 {
1236 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1237 }
1238 }
1239#endif
1240}
1241
1242// --------------------------------------------------------------------------
1243
1244template<typename BV>
1247 size_type s_ri,
1248 size_type ridx_from,
1249 size_type ridx_to,
1250 unsigned i, unsigned j,
1251 bm::word_t* tx_block,
1252 const bm::xor_sim_params& params)
1253{
1254 BM_ASSERT(ridx_from <= ridx_to);
1255 BM_ASSERT(IS_VALID_ADDR(s_block));
1256 BM_ASSERT(tx_block);
1257
1258 if (ridx_to > ref_vect_->size())
1259 ridx_to = ref_vect_->size();
1260
1262 bm::id64_t d64 = 0;
1263 found_block_xor_ = 0;
1264
1265 unsigned best_block_gain = 0;
1266 int best_ri = -1;
1267
1268 match_vect_.resize(0);
1269
1270 unsigned s_gc(0);
1271 bool s_gap = BM_IS_GAP(s_block);
1272 if (s_gap)
1273 {
1274 const bm::gap_word_t* gap_s_block = BMGAP_PTR(s_block);
1275 s_gc = bm::gap_length(gap_s_block);
1276 if (s_gc <= 3)
1277 return e_no_xor_match;
1278 s_block = nb_blocks_vect_.at(s_ri);
1279 BM_ASSERT(s_block);
1280 }
1281 bm::id64_t s_block_d64 = bm::calc_block_digest0(s_block);
1282
1283 // scan pass: over reference vectors
1284 //
1285 if (ridx_to > ridx_from + params.max_lookup_depth)
1286 ridx_to = ridx_from + params.max_lookup_depth;
1287
1288 size_type depth = 0;
1289 for (size_type ri = ridx_from; ri < ridx_to; ++ri, ++depth)
1290 {
1291 const bm::word_t* ref_block = get_ref_block(ri, i, j);
1292 if (BM_IS_GAP(ref_block))
1293 {
1294 const bm::gap_word_t* gap_ref_block = BMGAP_PTR(ref_block);
1295 unsigned r_gc = bm::gap_length(gap_ref_block);
1296 if (r_gc <= 3)
1297 continue;
1298
1299 if (s_gap) // both blocks GAPs - check if XOR does not make sense
1300 {
1301 if (s_gc < r_gc) // S-GAP block is shorter than Ref BLOCK
1302 {
1303 unsigned gc_diff = r_gc - s_gc;
1304 if (gc_diff >= s_gc) // cannot improve anything
1305 continue;
1306 }
1307 }
1308
1309 if (nb_blocks_vect_.size() > ri)
1310 ref_block = nb_blocks_vect_[ri];
1311 }
1312 if (!IS_VALID_ADDR(ref_block))
1313 continue;
1314
1315 BM_ASSERT(s_block != ref_block);
1316
1318 compute_xor_complexity_descr(s_block, s_block_d64, ref_block, x_descr_, xmd);
1319 if (xmd.xor_d64) // candidate XOR block found
1320 {
1321 xmd.ref_idx = ri;
1322 BM_ASSERT(xmd.match_type);
1323 if (xmd.block_gain > best_block_gain)
1324 {
1325 // check if "it is good enough"
1326 BM_ASSERT(x_best_metric_ <= s_block_best_metric_);
1327
1328 best_block_gain = xmd.block_gain;
1329 best_ri = int(ri);
1330 d64 = xmd.xor_d64;
1331 if (xmd.block_gain >= (bm::gap_max_bits-3))
1332 break;
1333 unsigned curr_best;
1334 //bm::xor_complement_match match =
1335 best_metric(xmd.xor_bc, xmd.xor_gc, &curr_best);
1336 float gain_ratio =
1337 float(xmd.block_gain) / float(s_block_best_metric_);
1338
1339 if (depth >= params.min_lookup_depth &&
1340 x_block_mtype_ == xmd.match_type)
1341 {
1342 if (xmd.block_gain >= params.stop_gain)
1343 break;
1344 if (gain_ratio > params.target_gain_ratio)
1345 break;
1346 }
1347 }
1348 match_vect_.push_back(xmd); // place into vector of matches
1349 }
1350 } // for ri
1351
1352 found_ridx_ = size_type(best_ri);
1353 x_d64_ = d64;
1354
1355 if (best_ri != -1) // found some gain, validate it now
1356 {
1357 // assumed that if XOR compression c_level is at the highest
1358 const float bie_bits_per_int = 3.0f; // c_level_ < 6 ? 3.75f : 3.0f;
1359 const unsigned bie_limit =
1360 unsigned(float(bm::gap_max_bits) / bie_bits_per_int);
1361
1362 unsigned xor_bc, xor_gc;
1363 const bm::word_t* ref_block = get_ref_block(size_type(best_ri), i, j);
1364 bool r_gap = BM_IS_GAP(ref_block);
1365 if (r_gap)
1366 ref_block = nb_blocks_vect_[size_type(best_ri)];
1367 found_block_xor_ = ref_block;
1368
1369 // TODO: one pass operation?
1370 bm::bit_block_xor(tx_block, s_block, ref_block, d64);
1371 bm::bit_block_change_bc(tx_block, &xor_gc, &xor_bc);
1372
1373 if (!xor_bc) // check if completely identical block?
1374 {
1375 x_best_metric_ = xor_bc;
1376 found_ridx_ = size_type(best_ri);
1377 x_block_mtype_ = rb_found = e_xor_match_BC;
1378
1379 unsigned block_pos;
1380 bool found = bm::block_find_first_diff(s_block, ref_block, &block_pos);
1381 if (!found)
1382 {
1383 x_block_mtype_ = rb_found = e_xor_match_EQ; x_d64_ = 0;
1384 }
1385 }
1386 else // find the best matching metric (GC, BC, iBC, ..)
1387 {
1388 rb_found = best_metric(xor_bc, xor_gc, &x_best_metric_);
1389
1390 // double check if XOR improves compression
1391 // with accounted serialization overhead
1392 //
1393 if (rb_found)
1394 {
1395 if (x_best_metric_ > bie_limit ||
1396 get_s_block_best() < x_best_metric_)
1397 return e_no_xor_match;
1398
1399 unsigned gain = get_s_block_best() - x_best_metric_;
1400 gain *= 3; // use bit estimate (speculative)
1401 // gain should be greater than overhead for storing
1402 // reference data: xor token, digest-64, block idx
1403 unsigned gain_min = unsigned(sizeof(char) + sizeof(unsigned));
1404 if (d64 != ~0ull) // if mask is all 1s - it is not used
1405 gain_min += (unsigned)sizeof(bm::id64_t);
1406 else
1407 {
1408 if (x_best_metric_ <= 1)
1409 return rb_found;
1410 }
1411 gain_min *= 8; // in bits
1412 if (gain > gain_min)
1413 return rb_found;
1414
1415 return e_no_xor_match;
1416 }
1417 }
1418 }
1419 return rb_found;
1420}
1421
1422// --------------------------------------------------------------------------
1423
1424template<typename BV>
1426{
1427 size_type match_size = 0;
1428 if (x_d64_ == ~0ull || !x_d64_)
1429 return match_size;
1431 match_size = (size_type)
1433 chain_match_vect_, match_vect_, found_ridx_, x_d64_, mtype);
1434 return match_size;
1435}
1436
1437// --------------------------------------------------------------------------
1438
1439template<typename BV>
1441 const bv_ref_vector_type& ref_vect,
1442 const bm::xor_sim_params& params)
1443{
1444 const bv_ref_vector_type* ref_vect_curr = this->ref_vect_; // save ref-vect
1445
1446 ref_vect_ = &ref_vect;
1447 bool sim_ok = compute_sim_model(sim_model, params);
1448
1449 ref_vect_ = ref_vect_curr; // restore state
1450 return sim_ok;
1451}
1452
1453template<typename BV>
1455 const bm::xor_sim_params& params)
1456{
1457 BM_ASSERT(ref_vect_);
1458
1459 sim_model.bv_blocks.clear(false);
1460 bool ret = ref_vect_->build_nb_digest_and_xor_matrix(sim_model.matr,
1461 sim_model.bv_blocks);
1462 if (!ret)
1463 return ret;
1464
1465 sync_nb_vect();
1466
1467 typename bvector_type::enumerator en(sim_model.bv_blocks);
1468 for (size_type col = 0; en.valid(); ++en, ++col)
1469 {
1470 size_type nb = *en;
1471 compute_sim_model(sim_model.matr, nb, col, params);
1472 } // for en
1473 return true;
1474}
1475
1476// --------------------------------------------------------------------------
1477
1478template<typename BV>
1480 typename xor_sim_model<BV>::matrix_chain_type &sim_model_matr,
1481 size_type nb,
1482 size_type nb_rank,
1483 const bm::xor_sim_params& params)
1484{
1485 BM_ASSERT(nb_rank < sim_model_matr.cols());
1486
1487 const float bie_bits_per_int = 3.0f; // c_level_ < 6 ? 3.75f : 3.0f;
1488 const unsigned bie_limit =
1489 unsigned(float(bm::gap_max_bits) / bie_bits_per_int);
1490
1491 deoptimize_gap_blocks(nb, params);
1492
1493 size_type rsize = ref_vect_->size();
1494
1495 unsigned i0, j0;
1496 bm::get_block_coord(nb, i0, j0);
1497
1498 for (size_type ri=0; true; ++ri)
1499 {
1500 bm::block_match_chain<size_type>* m_row = sim_model_matr.row(ri);
1501 bm::block_match_chain<size_type>& bmc = m_row[nb_rank];
1502 bmc.nb = nb;
1503 bmc.chain_size = 0;
1504 bmc.match = e_no_xor_match;
1505
1506 if (ri == rsize-1)
1507 break;
1508
1509 const bm::word_t* s_block = get_ref_block(ri, i0, j0);
1510 if (!IS_VALID_ADDR(s_block))
1511 continue;
1512
1513 if (BM_IS_GAP(s_block))
1514 {
1515 const bm::gap_word_t* gap_s_block = BMGAP_PTR(s_block);
1516 unsigned gc = bm::gap_length(gap_s_block);
1517 if (gc <= params.min_gaps)
1518 continue;
1519 }
1520
1521 // compute s-block best metrics
1523 if (s_block_best_metric_ < 3)
1524 continue;
1525
1526 // scan ref-vector plains for best similarity
1527 //
1528 bmc.match = search_best_xor_mask(s_block, ri, ri+1, rsize,
1529 i0, j0, xor_tmp_block_, params);
1530
1531 // take index in the ref-vector (not translated to a plain number)
1532 //
1533 size_type ridx = found_ridx();
1534 bm::id64_t d64 = get_xor_digest();
1535
1536 size_type chain_size = 0;
1537 if (d64 && d64 != ~0ULL) // something found
1538 {
1539 chain_size = refine_match_chain();
1540 }
1541 switch (bmc.match)
1542 {
1543 case e_no_xor_match:
1544 if (chain_size) // try chain compression for improve
1545 {
1547 BM_ASSERT(chain_size == pm_vect.size());
1548
1549 apply_xor_match_vector(xor_tmp_block_,
1550 s_block, ri,
1551 pm_vect, i0, j0);
1552 unsigned xor_bc, xor_gc;
1553 bm::bit_block_change_bc(xor_tmp_block_, &xor_gc, &xor_bc);
1554
1555 bmc.match = best_metric(xor_bc, xor_gc, &x_best_metric_);
1556 if (bmc.match)
1557 {
1558 if ((x_best_metric_ > bie_limit) ||
1559 (get_s_block_best() < x_best_metric_))
1560 {
1561 bmc.match = e_no_xor_match;
1562 continue;
1563 }
1564 for (size_type k = 0; k < chain_size; ++k)
1565 {
1566 const bm::match_pair& mp = pm_vect[k];
1567 bmc.ref_idx[k] = (unsigned)mp.ref_idx;
1568 bmc.xor_d64[k] = mp.xor_d64;
1569 bmc.chain_size++;
1570 } // for k
1571 }
1572 }
1573 break;
1574 case e_xor_match_EQ:
1575 bmc.chain_size++;
1576 bmc.ref_idx[0] = unsigned(ridx);
1577 break;
1578 default: // improving match found
1579 bmc.chain_size++;
1580 bmc.ref_idx[0] = unsigned(ridx);
1581 bmc.xor_d64[0] = d64;
1582
1583 if (chain_size) // match chain needs no verification
1584 {
1586 BM_ASSERT(chain_size == pm_vect.size());
1587 auto sz = pm_vect.size();
1588 for (size_type k = 0; k < sz; ++k)
1589 {
1590 BM_ASSERT(k < 64);
1591 const bm::match_pair& mp = pm_vect[k];
1592 bmc.ref_idx[k+1] = (unsigned)mp.ref_idx;
1593 bmc.xor_d64[k+1] = mp.xor_d64;
1594 bmc.chain_size++;
1595 } // for k
1596 }
1597
1598 } // switch
1599
1600 } // for ri
1601}
1602
1603
1604// --------------------------------------------------------------------------
1605
1606template<typename BV>
1608 bm::word_t* target_xor_block,
1609 const bm::word_t* s_block,
1610 size_type s_ri,
1611 const match_pairs_vector_type& pm_vect,
1612 unsigned i, unsigned j) const BMNOEXCEPT
1613{
1614 bool s_gap = BM_IS_GAP(s_block);
1615 if (s_gap)
1616 {
1617 s_block = nb_blocks_vect_.at(s_ri);
1618 BM_ASSERT(s_block);
1619 }
1620 auto sz = pm_vect.size();
1621 for (typename match_pairs_vector_type::size_type k = 0; k < sz; ++k)
1622 {
1623 const bm::match_pair& mp = pm_vect[k];
1624 const bm::word_t* ref_block = get_ref_block(mp.ref_idx, i, j);
1625 if (BM_IS_GAP(ref_block))
1626 {
1627 ref_block = nb_blocks_vect_[mp.ref_idx];
1628 BM_ASSERT(ref_block);
1629 }
1630 if (!k)
1631 bm::bit_block_xor(target_xor_block, s_block, ref_block, mp.xor_d64);
1632 else
1633 bm::bit_block_xor(target_xor_block, ref_block, mp.xor_d64);
1634 } // for k
1635}
1636
1637// --------------------------------------------------------------------------
1638
1639template<typename BV>
1641xor_scanner<BV>::best_metric(unsigned bc, unsigned gc,
1642 unsigned* best_metric) BMNOEXCEPT
1643{
1645 unsigned ibc = bm::gap_max_bits - bc;
1646 if (!ibc)
1647 {
1648 *best_metric = gc;
1649 return e_xor_match_GC;
1650 }
1651 if (gc < bc) // GC < BC
1652 {
1653 if (gc <= ibc)
1654 {
1655 *best_metric = gc;
1656 return e_xor_match_GC;
1657 }
1658 }
1659 else // GC >= BC
1660 {
1661 if (bc <= ibc)
1662 {
1663 *best_metric = bc;
1664 return e_xor_match_BC;
1665 }
1666 }
1667 *best_metric = ibc;
1668 return e_xor_match_iBC;
1669}
1670
1671// --------------------------------------------------------------------------
1672
1673
1674template<typename BV>
1676{
1677 size_t sz = nb_blocks_vect_.size();
1678 for (size_t i = 0; i < sz; ++i)
1679 {
1680 bm::word_t* blk = nb_blocks_vect_[i];
1681 if (blk)
1682 alloc_.free_bit_block(blk);
1683 }
1684 nb_blocks_vect_.resize(0);
1685}
1686
1687// --------------------------------------------------------------------------
1688
1689template<typename BV>
1691 const xor_sim_params& params)
1692{
1693 size_type rsize = ref_vect_->size();
1694 BM_ASSERT(nb_blocks_vect_.size() == rsize);
1695 unsigned i0, j0;
1696 bm::get_block_coord(nb, i0, j0);
1697
1698 for (size_type ri=0; ri < rsize; ++ri)
1699 {
1700 const bm::word_t* block = get_ref_block(ri, i0, j0);
1701 if (BM_IS_GAP(block))
1702 {
1703 const bm::gap_word_t* gap_block = BMGAP_PTR(block);
1704 unsigned gc = bm::gap_length(gap_block);
1705 if (gc <= params.min_gaps)
1706 continue;
1707
1708 bm::word_t* t_block = nb_blocks_vect_.at(ri);
1709 if (!t_block)
1710 {
1711 t_block = alloc_.alloc_bit_block();
1712 nb_blocks_vect_[ri] = t_block;
1713 }
1714 bm::gap_convert_to_bitset(t_block, BMGAP_PTR(block));
1715 block = t_block;
1716 }
1717 if (!IS_VALID_ADDR(block))
1718 continue;
1719
1720 block_waves_xor_descr& x_descr = nb_xdescr_vect_[ri];
1721 unsigned gc, bc;
1722 bm::compute_s_block_descr(block, x_descr, &gc, &bc);
1723 nb_gc_vect_[ri] = gc;
1724 nb_bc_vect_[ri] = bc;
1725 } // for ri
1726
1727}
1728
1729// --------------------------------------------------------------------------
1730
1731template<typename BV>
1733{
1734 size_type rsize = ref_vect_->size();
1735 if (nb_blocks_vect_.size() == rsize)
1736 return;
1737 free_blocks();
1738 nb_blocks_vect_.resize(rsize);
1739 bm::word_t** vect_data = nb_blocks_vect_.data();
1740 for (size_type i = 0; i < rsize; ++i)
1741 vect_data[i] = 0;
1742 nb_gc_vect_.resize(rsize);
1743 nb_bc_vect_.resize(rsize);
1744 nb_xdescr_vect_.resize(rsize);
1745}
1746
1747// --------------------------------------------------------------------------
1748
1749} // namespace bm
1750
1751#endif
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d)
Definition: bmavx2.h:3332
#define VECT_BLOCK_CHANGE(block, size)
Definition: bmavx2.h:3314
#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d)
Definition: bmavx2.h:3335
#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc)
Definition: bmavx2.h:3317
Definitions(internal)
#define IS_VALID_ADDR(addr)
Definition: bmdef.h:161
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BMGAP_PTR(ptr)
Definition: bmdef.h:189
#define BM_IS_GAP(ptr)
Definition: bmdef.h:191
#define BM_ASSERT
Definition: bmdef.h:139
Bit manipulation primitives (internal)
List of reference bit-vectors with their true index associations.
Definition: bmxor.h:624
bvector_type::size_type size_type
Definition: bmxor.h:627
void add_vectors(const BMATR &bmatr)
Append basic bit-matrix to the list of reference vectors.
Definition: bmxor.h:726
bm::block_match_chain< size_type > block_match_chain_type
Definition: bmxor.h:633
void fill_alloc_digest(bvector_type &bv_blocks) const
Fill block allocation digest for all vectors in the reference collection.
Definition: bmxor.h:699
bm::heap_vector< std::size_t, bv_allocator_type, true > bv_plane_vector_type
Definition: bmxor.h:772
size_type size() const BMNOEXCEPT
Get reference list size.
Definition: bmxor.h:660
static size_type not_found() BMNOEXCEPT
not-found value for find methods
Definition: bmxor.h:672
bvector_type * bvector_type_ptr
Definition: bmxor.h:628
bool build_nb_digest_and_xor_matrix(matrix_chain_type &matr, bvector_type &bv_blocks) const
Calculate blocks digest and resize XOR distance matrix based on total number of available blocks.
Definition: bmxor.h:759
const bvector_type * bvector_type_const_ptr
Definition: bmxor.h:629
void add(const bvector_type *bv, size_type ref_idx)
Add reference vector.
Definition: bmxor.h:652
const bvector_type * get_bv(size_type idx) const BMNOEXCEPT
Get reference vector by the index in this ref-vector.
Definition: bmxor.h:663
size_type find_bv(const bvector_type *bv) const BMNOEXCEPT
Find vector index by the pointer.
Definition: bmxor.h:687
size_type get_row_idx(size_type idx) const BMNOEXCEPT
Get reference row index by the index in this ref-vector.
Definition: bmxor.h:667
void reset()
reset the collection (resize(0))
Definition: bmxor.h:641
void build(const BMATR &bmatr)
Reset and build vector of references from a basic bit-matrix all NULL rows are skipped,...
Definition: bmxor.h:716
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
Definition: bmxor.h:636
size_type find(std::size_t ref_idx) const BMNOEXCEPT
Find vector index by the reference index.
Definition: bmxor.h:676
bm::heap_vector< bvector_type_const_ptr, bv_allocator_type, true > bvptr_vector_type
Definition: bmxor.h:771
void resize_xor_matrix(matrix_chain_type &matr, size_type total_blocks) const
Utility function to resize matrix based on number of vectors and blocks.
Definition: bmxor.h:746
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:630
void add_sparse_vector(const SV &sv)
Add bit-transposed sparse vector as a bit-matrix.
Definition: bmxor.h:739
bv_plane_vector_type ref_bvects_rows_
reference vector row idxs
Definition: bmxor.h:777
unsigned rows_acc_
total rows accumulator
Definition: bmxor.h:775
bvptr_vector_type ref_bvects_
reference vector pointers
Definition: bmxor.h:776
Constant iterator designed to enumerate "ON" bits.
Definition: bm.h:603
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Definition: bm.h:283
bvector_size_type size_type
Definition: bm.h:121
Alloc allocator_type
Definition: bm.h:117
XOR scanner to search for complement-similarities in collections of bit-vectors.
Definition: bmxor.h:820
unsigned get_s_gc() const BMNOEXCEPT
Definition: bmxor.h:955
const bm::word_t * get_ref_block(size_type ri, unsigned i, unsigned j) const BMNOEXCEPT
Return block from the reference vector [vect_idx, block_i, block_j].
Definition: bmxor.h:974
static bm::xor_complement_match best_metric(unsigned bc, unsigned gc, unsigned *best_metric) BMNOEXCEPT
Definition: bmxor.h:1641
bm::bv_ref_vector< BV > bv_ref_vector_type
Definition: bmxor.h:822
bm::heap_vector< unsigned, bv_allocator_type, true > bv_bcgc_vector_type
Definition: bmxor.h:836
void sync_nb_vect()
Sync TEMP vector size.
Definition: bmxor.h:1732
bvector_type::size_type size_type
Definition: bmxor.h:825
bm::xor_complement_match search_best_xor_mask(const bm::word_t *s_block, size_type ri, size_type ridx_from, size_type ridx_to, unsigned i, unsigned j, bm::word_t *tx_block, const bm::xor_sim_params &params)
Scan for all candidate bit-blocks to find mask or match.
Definition: bmxor.h:1246
void free_blocks() BMNOEXCEPT
Free the collection of temp blocks.
Definition: bmxor.h:1675
unsigned get_x_best_metric() const BMNOEXCEPT
Definition: bmxor.h:951
bool validate_xor(const bm::word_t *xor_block) const BMNOEXCEPT
Check if XOR transform simplified block enough for compressibility objective.
bool compute_sim_model(xor_sim_model< BV > &sim_model, const bv_ref_vector_type &ref_vect, const bm::xor_sim_params &params)
Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors.
Definition: bmxor.h:1440
void get_s_block_stats(size_type ri) BMNOEXCEPT
Get statistics for the r-(or s-) block.
Definition: bmxor.h:1045
xor_matches_vector_type & get_match_vector() BMNOEXCEPT
Definition: bmxor.h:966
match_pairs_vector_type & get_match_pairs() BMNOEXCEPT
Definition: bmxor.h:969
size_type refine_match_chain()
Run a search to add possible XOR match chain additions.
Definition: bmxor.h:1425
bm::id64_t get_xor_digest() const BMNOEXCEPT
Definition: bmxor.h:952
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:824
unsigned get_s_bc() const BMNOEXCEPT
Definition: bmxor.h:954
bv_ref_vector_type::matrix_chain_type matrix_chain_type
Definition: bmxor.h:832
void deoptimize_gap_blocks(size_type nb, const xor_sim_params &params)
Deoptimize vertical slice of GAP blocks.
Definition: bmxor.h:1690
unsigned get_s_block_best() const BMNOEXCEPT
Definition: bmxor.h:956
bm::heap_vector< bm::block_xor_match_descr, bv_allocator_type, true > xor_matches_vector_type
Definition: bmxor.h:828
bm::block_waves_xor_descr & get_descr() BMNOEXCEPT
Definition: bmxor.h:960
void compute_s_block_stats(const bm::word_t *block) BMNOEXCEPT
Compute statistics for the r-(or s-) block.
Definition: bmxor.h:1057
bm::xor_complement_match get_best_match_type() const BMNOEXCEPT
Return best match type of a found block.
Definition: bmxor.h:946
bm::heap_vector< bm::block_waves_xor_descr, bv_allocator_type, true > bv_xdescr_vector_type
Definition: bmxor.h:839
const bv_ref_vector_type & get_ref_vector() const BMNOEXCEPT
Definition: bmxor.h:853
BV bvector_type
Definition: bmxor.h:823
size_type found_ridx() const BMNOEXCEPT
Definition: bmxor.h:942
void apply_xor_match_vector(bm::word_t *target_xor_block, const bm::word_t *s_block, size_type s_ri, const match_pairs_vector_type &pm_vect, unsigned i, unsigned j) const BMNOEXCEPT
XOR all match blocks to target using their digest masks.
Definition: bmxor.h:1607
void compute_xor_complexity_descr(const bm::word_t *BMRESTRICT block, bm::id64_t block_d64, const bm::word_t *BMRESTRICT xor_block, bm::block_waves_xor_descr &BMRESTRICT x_descr, bm::block_xor_match_descr &BMRESTRICT xmd) const BMNOEXCEPT
Compute reference complexity descriptor based on XOR vector.
Definition: bmxor.h:1070
void set_ref_vector(const bv_ref_vector_type *ref_vect) BMNOEXCEPT
Definition: bmxor.h:850
const bm::word_t * get_found_block() const BMNOEXCEPT
Definition: bmxor.h:949
bm::heap_vector< bm::match_pair, bv_allocator_type, true > match_pairs_vector_type
Definition: bmxor.h:830
bm::heap_vector< bm::word_t *, bv_allocator_type, true > bv_blocks_vector_type
Definition: bmxor.h:834
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition: bmutil.h:573
void bit_block_xor_change64(const bm::word_t *BMRESTRICT s_block, const bm::word_t *BMRESTRICT ref_block, unsigned size, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
Definition: bmxor.h:151
BMFORCEINLINE unsigned bit_count_min_unroll(const bm::word_t *BMRESTRICT block, const bm::word_t *BMRESTRICT block_end) BMNOEXCEPT
Bitcount for bit block without agressive unrolling.
Definition: bmfunc.h:4962
void bit_block_xor_change32(const bm::word_t *BMRESTRICT block, const bm::word_t *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
Definition: bmxor.h:96
BMFORCEINLINE unsigned word_bitcount64(bm::id64_t x) BMNOEXCEPT
Definition: bmutil.h:596
bm::id64_t calc_block_digest0(const bm::word_t *const block) BMNOEXCEPT
Compute digest for 64 non-zero areas.
Definition: bmfunc.h:1092
bm::id64_t bit_block_xor(bm::word_t *BMRESTRICT dst, const bm::word_t *BMRESTRICT src) BMNOEXCEPT
Plain bitblock XOR operation. Function does not analyse availability of source and destination blocks...
Definition: bmfunc.h:8166
void gap_convert_to_bitset(unsigned *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned len=0) BMNOEXCEPT
GAP block to bitblock conversion.
Definition: bmfunc.h:4441
BMFORCEINLINE bm::gap_word_t gap_length(const bm::gap_word_t *BMRESTRICT buf) BMNOEXCEPT
Returs GAP block length.
Definition: bmfunc.h:1575
Definition: bm.h:78
xor_complement_match
XOR complementarity type between 2 blocks.
Definition: bmxor.h:81
@ e_no_xor_match
Definition: bmxor.h:82
@ e_xor_match_GC
Definition: bmxor.h:83
@ e_xor_match_BC
Definition: bmxor.h:84
@ e_xor_match_EQ
Definition: bmxor.h:86
@ e_xor_match_iBC
Definition: bmxor.h:85
const unsigned set_block_digest_wave_size
Definition: bmconst.h:67
void bit_block_change_bc(const bm::word_t *BMRESTRICT block, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
Definition: bmfunc.h:5218
unsigned int word_t
Definition: bmconst.h:39
unsigned char check_pair_vect_vbr(const BMChain &mchain, const RVect &ref_vect)
Check effective bit-rate for the XOR encode vector.
Definition: bmxor.h:404
void compute_s_block_descr(const bm::word_t *BMRESTRICT block, block_waves_xor_descr &BMRESTRICT x_descr, unsigned *BMRESTRICT s_gc, unsigned *BMRESTRICT s_bc) BMNOEXCEPT
Compute reference (non-XOR) 64-dim complexity descriptor for the s-block.
Definition: bmxor.h:430
unsigned bit_block_change64(const bm::word_t *BMRESTRICT in_block, unsigned size) BMNOEXCEPT
Definition: bmfunc.h:5169
BMFORCEINLINE void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) BMNOEXCEPT
Recalc linear bvector block index into 2D matrix coordinates.
Definition: bmfunc.h:172
VT::size_type greedy_refine_match_vector(PVT &match_pairs_vect, VT &match_vect, typename VT::size_type best_ref_idx, bm::id64_t d64, bm::xor_complement_match match_type)
Greedy algorithm to find additional matches improving the inital best match block on its match type.
Definition: bmxor.h:327
bool block_find_first_diff(const bm::word_t *BMRESTRICT blk, const bm::word_t *BMRESTRICT arg_blk, unsigned *BMRESTRICT pos) BMNOEXCEPT
Find first bit which is different between two blocks (GAP or bit)
Definition: bmfunc.h:8953
unsigned long long int id64_t
Definition: bmconst.h:35
const unsigned block_waves
Definition: bmconst.h:66
unsigned bit_block_change32(const bm::word_t *BMRESTRICT block, unsigned size) BMNOEXCEPT
Definition: bmfunc.h:5127
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
Definition: bmutil.h:335
unsigned short gap_word_t
Definition: bmconst.h:78
bm::id_t bvector_size_type
Definition: bm.h:103
const unsigned gap_max_bits
Definition: bmconst.h:81
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
Definition: bmutil.h:345
void bit_block_xor_change(const bm::word_t *BMRESTRICT block, const bm::word_t *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
Definition: bmxor.h:207
XOR match chain.
Definition: bmxor.h:290
BLOCK_IDX nb
Definition: bmxor.h:291
unsigned chain_size
Definition: bmxor.h:292
bool operator==(const block_match_chain &bmc) const BMNOEXCEPT
Definition: bmxor.h:297
bm::id64_t xor_d64[64]
Definition: bmxor.h:294
unsigned ref_idx[64]
Definition: bmxor.h:293
bm::xor_complement_match match
Definition: bmxor.h:295
Structure to compute XOR gap-count profile by sub-block waves.
Definition: bmxor.h:230
unsigned short sb_bc[bm::block_waves]
BIT counts.
Definition: bmxor.h:233
unsigned short sb_gc[bm::block_waves]
GAP counts.
Definition: bmxor.h:232
unsigned short sb_xor_gc[bm::block_waves]
XOR-mask GAP count.
Definition: bmxor.h:236
unsigned short sb_xor_bc[bm::block_waves]
XOR-mask GAP count.
Definition: bmxor.h:237
Capture the XOR filter results (xor block against ref.block)
Definition: bmxor.h:245
size_type ref_idx
reference vector index
Definition: bmxor.h:250
bm::id64_t xor_d64
recorded digest
Definition: bmxor.h:251
bvector_size_type size_type
Definition: bmxor.h:246
unsigned block_gain
XOR filter improvement (best)
Definition: bmxor.h:249
bm::id64_t ibc_d64
Definition: bmxor.h:256
bm::xor_complement_match match_type
match type
Definition: bmxor.h:248
XOR match pair.
Definition: bmxor.h:274
bvector_size_type ref_idx
reference vector index
Definition: bmxor.h:275
bm::id64_t xor_d64
recorded digest
Definition: bmxor.h:276
match_pair(bvector_size_type idx, bm::id64_t d64)
Definition: bmxor.h:279
XOR similarity model.
Definition: bmxor.h:791
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
Definition: bmxor.h:801
bm::block_match_chain< size_type > block_match_chain_type
Definition: bmxor.h:798
bvector_type::size_type size_type
Definition: bmxor.h:794
matrix_chain_type matr
model matrix
Definition: bmxor.h:804
bvector_type bv_blocks
blocks digest
Definition: bmxor.h:805
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:795
Parameters for XOR similarity search.
Definition: bmxor.h:59
unsigned stop_gain
Definition: bmxor.h:62
unsigned min_lookup_depth
Definition: bmxor.h:60
unsigned min_gaps
Definition: bmxor.h:64
unsigned max_lookup_depth
Definition: bmxor.h:61
float target_gain_ratio
Definition: bmxor.h:63