BitMagic-C++
bmalgo_similarity.h
Go to the documentation of this file.
1#ifndef BMALGO_SIMILARITY__H__INCLUDED__
2#define BMALGO_SIMILARITY__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21#include <algorithm>
22#include <functional>
23#include <vector>
24
25#ifndef BM__H__INCLUDED__
26// BitMagic utility headers do not include main "bm.h" declaration
27// #include "bm.h" or "bm64.h" explicitly
28# error missing include (bm.h or bm64.h)
29#endif
30
31
32#include "bmfunc.h"
33#include "bmdef.h"
34
35#include "bmalgo_impl.h"
36
37namespace bm
38{
39
40/*! Similarity descriptor between two objects (bit vectors, blocks, etc)
41 \internal
42*/
43template <typename SO, unsigned DMD_SZ, typename IDX_VALUE, typename SValue, typename SFunc>
45{
46public:
48 typedef SValue similarity_value_type;
49 typedef SFunc similarity_functor;
50public:
52 : so1_(0), so2_(0), so1_idx_(0), so2_idx_(0)
53 {
54 similarity_ = 0;
55 }
56
57 similarity_descriptor(const SO* so1, const SO* so2,
58 const distance_metric_descriptor* dmd_ptr)
59 :so1_(so1),
60 so2_(so2),
61 so1_idx_(0), so2_idx_(0)
62 {
63 for (size_t i = 0; i < DMD_SZ; ++i)
64 dmd_[i] = dmd_ptr[i];
65 similarity_ = 0;
66 }
67
68 similarity_descriptor(const SO* so1, IDX_VALUE i1,
69 const SO* so2, IDX_VALUE i2,
70 const distance_metric_descriptor* dmd_ptr)
71 :so1_(so1), so2_(so2), so1_idx_(i1), so2_idx_(i2)
72 {
73 for (size_t i = 0; i < DMD_SZ; ++i)
74 dmd_[i] = dmd_ptr[i];
75 similarity_ = 0;
76 }
77
80 so1_(sd.so1_),
81 so2_(sd.so2_),
84 {
85 for (size_t i = 0; i < DMD_SZ; ++i)
86 dmd_[i] = sd.dmd_[i];
87 }
89 {
91 so1_ = sd.so1_; so2_ = sd.so2_;
93 for (size_t i = 0; i < DMD_SZ; ++i)
94 dmd_[i] = sd.dmd_[i];
95 return *this;
96 }
97
98 bool operator > (const similarity_descriptor& sd) const
99 {
100 return similarity_ > sd.similarity_;
101 }
102
103 SValue similarity() const { return similarity_; }
104 void set_similarity(SValue s) { similarity_ = s;}
105
106 const SO* get_first() const { return so1_; }
107 const SO* get_second() const { return so2_; }
108
109 IDX_VALUE get_first_idx() const { return so1_idx_; }
110 IDX_VALUE get_second_idx() const { return so2_idx_; }
111
114
115 void set_metric(size_t i, distance_metric metric)
116 {
117 BM_ASSERT(i < DMD_SZ);
118 dmd_[i].metric = metric;
119 }
120
121
122protected:
123 SValue similarity_; //< final similarity product
124 const SO* so1_; //< object 1 for similarity comparison
125 const SO* so2_; //< object 2 for similarity comparison
126 IDX_VALUE so1_idx_; //< index of object 1
127 IDX_VALUE so2_idx_; //< index of object 2
128 distance_metric_descriptor dmd_[DMD_SZ]; //< array of distance operations defined on objects
129};
130
131/*!
132 Batch of objects for similarity measurement
133 \internal
134*/
135template<class SDESCR>
137{
139 typedef typename SDESCR::similarity_object_type similarity_object_type;
140 typedef typename SDESCR::similarity_value_type similarity_value_type;
141 typedef typename SDESCR::similarity_functor similarity_functor;
142 typedef std::vector<SDESCR> vector_type;
143
144 /// run the similarity calculation using distance metrics engine
146 {
147 for( size_t i = 0; i < descr_vect_.size(); ++i)
148 {
150
151 const similarity_object_type* so1 = sdescr.get_first();
152 const similarity_object_type* so2 = sdescr.get_second();
153
154 distance_metric_descriptor* dit = sdescr.distance_begin();
155 distance_metric_descriptor* dit_end = sdescr.distance_end();
156 bm::distance_operation(*so1, *so2, dit, dit_end);
157
158 // reduce: use functor to compute final similarity metric
160 similarity_value_type d = func(dit, dit_end);
161 sdescr.set_similarity(d);
162 } // for i
163 }
164
165 void sort()
166 {
167 std::sort(descr_vect_.begin(), descr_vect_.end(), std::greater<SDESCR>());
168 }
169
170 void reserve(size_t cap)
171 {
172 descr_vect_.reserve(cap);
173 }
174
176 {
177 descr_vect_.push_back(sdt);
178 }
179
180public:
181 std::vector<SDESCR> descr_vect_;
182};
183
184
185/**
186 Utility function to build jaccard similarity batch for sparse_vector<>
187 \internal
188*/
189template<class SIMBATCH, class SV>
190void build_jaccard_similarity_batch(SIMBATCH& sbatch, const SV& sv)
191{
192
193 size_t planes = sv.get_bmatrix().rows();//slices();
194 sbatch.reserve((planes * planes) / 2);
195
197 dmd[0].metric = bm::COUNT_AND;
198 dmd[1].metric = bm::COUNT_OR;
199
200 // build a batch for triangular distance matrix
201 //
202 for (unsigned i = 0; i < planes; ++i)
203 {
204 if (const typename SV::bvector_type* bv1 = sv.get_slice(i))
205 {
206 for (unsigned j = i+1; j < planes; ++j)
207 {
208 const typename SV::bvector_type* bv2 = sv.get_slice(j);
209 if (bv2 && bv1 != bv2)
210 sbatch.push_back(typename SIMBATCH::similaruty_descriptor_type(bv1, i, bv2, j, &dmd[0]));
211 } // for j
212 }
213
214 } // for i
215}
216
217
218
219} // namespace bm
220
221
222#endif
Algorithms for bvector<>
Definitions(internal)
#define BM_ASSERT
Definition: bmdef.h:139
Bit manipulation primitives (internal)
distance_metric_descriptor dmd_[DMD_SZ]
IDX_VALUE get_second_idx() const
similarity_descriptor(const SO *so1, IDX_VALUE i1, const SO *so2, IDX_VALUE i2, const distance_metric_descriptor *dmd_ptr)
similarity_descriptor(const similarity_descriptor &sd)
similarity_descriptor(const SO *so1, const SO *so2, const distance_metric_descriptor *dmd_ptr)
void set_metric(size_t i, distance_metric metric)
similarity_descriptor & operator=(const similarity_descriptor &sd)
const SO * get_first() const
const SO * get_second() const
distance_metric_descriptor * distance_begin()
distance_metric_descriptor * distance_end()
IDX_VALUE get_first_idx() const
bool operator>(const similarity_descriptor &sd) const
void distance_operation(const BV &bv1, const BV &bv2, distance_metric_descriptor *dmit, distance_metric_descriptor *dmit_end) BMNOEXCEPT
Distance computing template function.
Definition: bmalgo_impl.h:766
distance_metric
Distance metrics codes defined for vectors A and B.
Definition: bmalgo_impl.h:58
@ COUNT_AND
(A & B).count()
Definition: bmalgo_impl.h:59
@ COUNT_OR
(A | B).count()
Definition: bmalgo_impl.h:61
Definition: bm.h:78
void build_jaccard_similarity_batch(SIMBATCH &sbatch, const SV &sv)
Utility function to build jaccard similarity batch for sparse_vector<>
Distance metric descriptor, holds metric code and result.
Definition: bmalgo_impl.h:87
std::vector< SDESCR > descr_vect_
void reserve(size_t cap)
SDESCR::similarity_object_type similarity_object_type
std::vector< SDESCR > vector_type
void push_back(const similaruty_descriptor_type &sdt)
void calculate()
run the similarity calculation using distance metrics engine
SDESCR::similarity_value_type similarity_value_type
SDESCR::similarity_functor similarity_functor
bm::bvector bvector_type
Definition: xsample07a.cpp:94