BitMagic-C++
bmsse2.h
1#ifndef BMSSE2__H__INCLUDED__
2#define BMSSE2__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmsse2.h
22 \brief Compute functions for SSE2 SIMD instruction set (internal)
23*/
24
25#if !defined(__arm64__) && !defined(__arm__)
26#ifndef BMWASMSIMDOPT
27#include<mmintrin.h>
28#endif
29#include<emmintrin.h>
30#endif
31
32#include "bmdef.h"
33#include "bmutil.h"
34#include "bmsse_util.h"
35
36
37#ifdef __GNUG__
38#pragma GCC diagnostic push
39#pragma GCC diagnostic ignored "-Wconversion"
40#endif
41
42namespace bm
43{
44
45
46/*!
47 SSE2-optimized bit-counting function; implements a parallel bit-counting
48 algorithm for the SSE2 instruction set.
49
50<pre>
51unsigned CalcBitCount32(unsigned b)
52{
53 b = (b & 0x55555555) + (b >> 1 & 0x55555555);
54 b = (b & 0x33333333) + (b >> 2 & 0x33333333);
55 b = (b + (b >> 4)) & 0x0F0F0F0F;
56 b = b + (b >> 8);
57 b = (b + (b >> 16)) & 0x0000003F;
58 return b;
59}
60</pre>
61
62 @ingroup SSE2
63
64*/
65inline
66bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
67{
68 const unsigned mu1 = 0x55555555;
69 const unsigned mu2 = 0x33333333;
70 const unsigned mu3 = 0x0F0F0F0F;
71 const unsigned mu4 = 0x0000003F;
72
73 // Loading masks
74 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
75 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
76 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
77 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
78 __m128i mcnt;
79 mcnt = _mm_xor_si128(m1, m1); // cnt = 0
80
81 __m128i tmp1, tmp2;
82 do
83 {
84 __m128i b = _mm_load_si128(block);
85 ++block;
86
87 // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
88 tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
89 tmp1 = _mm_and_si128(tmp1, m1);
90 tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
91 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
92
93 // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
94 tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
95 tmp1 = _mm_and_si128(tmp1, m2);
96 tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
97 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
98
99 // b = (b + (b >> 4)) & 0x0F0F0F0F;
100 tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
101 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
102 b = _mm_and_si128(b, m3); // & 0x0F0F0F0F
103
104 // b = b + (b >> 8);
105 tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
106 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
107
108 // b = (b + (b >> 16)) & 0x0000003F;
109 tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
110 b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
111 b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
112
113 mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
114
115 } while (block < block_end);
116
117
118 bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
119 _mm_store_si128((__m128i*)tcnt, mcnt);
120
121 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
122}
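A minimal usage sketch (editorial illustration, not part of the header): counting all set bits of one 16-byte aligned bit-block; the wrapper name count_block_bits is hypothetical.
<pre>
// illustrative only: popcount of a full, 16-byte aligned bit-block
inline bm::id_t count_block_bits(const bm::word_t* block)
{
    const __m128i* first = (const __m128i*) block;
    const __m128i* last  = (const __m128i*) (block + bm::set_block_size);
    return bm::sse2_bit_count(first, last);
}
</pre>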
123
124
125
126template<class Func>
127bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
128 const __m128i* BMRESTRICT block_end,
129 const __m128i* BMRESTRICT mask_block,
130 Func sse2_func)
131{
132 const unsigned mu1 = 0x55555555;
133 const unsigned mu2 = 0x33333333;
134 const unsigned mu3 = 0x0F0F0F0F;
135 const unsigned mu4 = 0x0000003F;
136
137 // Loading masks
138 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
139 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
140 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
141 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
142 __m128i mcnt;
143 mcnt = _mm_xor_si128(m1, m1); // cnt = 0
144 do
145 {
146 __m128i tmp1, tmp2;
147 __m128i b = _mm_load_si128(block++);
148
149 tmp1 = _mm_load_si128(mask_block++);
150
151 b = sse2_func(b, tmp1);
152
153 // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
154 tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
155 tmp1 = _mm_and_si128(tmp1, m1);
156 tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
157 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
158
159 // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
160 tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
161 tmp1 = _mm_and_si128(tmp1, m2);
162 tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
163 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
164
165 // b = (b + (b >> 4)) & 0x0F0F0F0F;
166 tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
167 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
168 b = _mm_and_si128(b, m3); // & 0x0F0F0F0F
169
170 // b = b + (b >> 8);
171 tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
172 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
173
174 // b = (b + (b >> 16)) & 0x0000003F;
175 tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
176 b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
177 b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
178
179 mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
180
181 } while (block < block_end);
182
183 bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
184 _mm_store_si128((__m128i*)tcnt, mcnt);
185
186 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
187}
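A hedged sketch of supplying the functor argument, mirroring the VECT_BITCOUNT_AND macro defined near the end of this file; blk1 and blk2 stand for aligned bit-blocks and are assumptions of the example, bm::sse2_and comes from bmsse_util.h.
<pre>
// illustrative only: popcount of the AND of two aligned bit-blocks
bm::id_t and_count =
    bm::sse2_bit_count_op((const __m128i*) blk1,
                          (const __m128i*) (blk1 + bm::set_block_size),
                          (const __m128i*) blk2,
                          bm::sse2_and);
</pre>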
188
189/*!
190 @brief check if block is all zero bits
191 @ingroup SSE2
192*/
193inline
194bool sse2_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
195{
196 __m128i w;
197 const __m128i maskz = _mm_setzero_si128();
198 const __m128i* BMRESTRICT block_end =
199 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
200
201 do
202 {
203 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
204 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
205 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
206 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
207 if (m1 != 0xFFFF || m2 != 0xFFFF)
208 return false;
209 block += 4;
210 } while (block < block_end);
211 return true;
212}
213
214/*!
215 @brief check if block is all ONE bits
216 @ingroup SSE2
217*/
218inline
219bool sse2_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
220{
221 __m128i w;
222 const __m128i mask1 = _mm_set_epi32 (~0u, ~0u, ~0u, ~0u);
223 const __m128i* BMRESTRICT block_end =
224 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
225
226 do
227 {
228 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
229 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
230 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
231 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
232 if (m1 != 0xFFFF || m2 != 0xFFFF)
233 return false;
234 block+=4;
235 } while (block < block_end);
236 return true;
237}
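A short sketch (blk is an assumed, 16-byte aligned block pointer) of how the two predicates are typically paired to classify a block before further processing:
<pre>
// illustrative only: classify a 16-byte aligned bit-block
const __m128i* p = (const __m128i*) blk;
if (bm::sse2_is_all_zero(p))     { /* block may be treated as empty */ }
else if (bm::sse2_is_all_one(p)) { /* block may be replaced by a FULL block */ }
</pre>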
238
239/*!
240 @brief check if digest stride is all zero bits
241 @ingroup SSE2
242*/
243BMFORCEINLINE
244bool sse2_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
245{
246 const __m128i maskz = _mm_setzero_si128();
247
248 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
249 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
250 wA = _mm_or_si128(wA, wB);
251 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
252
253 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
254 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
255 wA = _mm_or_si128(wA, wB);
256 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
257
258 if (m1 != 0xFFFF || m2 != 0xFFFF)
259 return false;
260 return true;
261}
262
263/*!
264 @brief set digest stride to 0xFF.. or 0x0 value
265 @ingroup SSE2
266*/
267BMFORCEINLINE
268void sse2_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
269{
270 __m128i mV = _mm_set1_epi32(int(value));
271 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
272 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
273 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
274 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
275}
276
277
278/**
279 Build partial XOR product of 2 bit-blocks using digest mask
280
281 @param target_block - target := block ^ xor_block
282 @param block - arg1
283 @param xor_block - arg2
284 @param digest - mask for each block wave to XOR (1) or just copy (0)
285
286 @ingroup SSE2
287*/
288inline
289void sse2_bit_block_xor(bm::word_t* target_block,
290 const bm::word_t* block,
291 const bm::word_t* xor_block,
292 bm::id64_t digest) BMNOEXCEPT
293{
294 for (unsigned i = 0; i < bm::block_waves; ++i)
295 {
296 const bm::id64_t mask = (1ull << i);
297 unsigned off = (i * bm::set_block_digest_wave_size);
298 const __m128i* sub_block = (__m128i*) (block + off);
299 __m128i* t_sub_block = (__m128i*)(target_block + off);
300
301 if (digest & mask) // XOR filtered sub-block
302 {
303 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
304 __m128i mA, mB, mC, mD;
305 mA = _mm_xor_si128(_mm_load_si128(sub_block),
306 _mm_load_si128(xor_sub_block));
307 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
308 _mm_load_si128(xor_sub_block+1));
309 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
310 _mm_load_si128(xor_sub_block+2));
311 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
312 _mm_load_si128(xor_sub_block+3));
313
314 _mm_store_si128(t_sub_block, mA);
315 _mm_store_si128(t_sub_block+1, mB);
316 _mm_store_si128(t_sub_block+2, mC);
317 _mm_store_si128(t_sub_block+3, mD);
318
319 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
320 _mm_load_si128(xor_sub_block+4));
321 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
322 _mm_load_si128(xor_sub_block+5));
323 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
324 _mm_load_si128(xor_sub_block+6));
325 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
326 _mm_load_si128(xor_sub_block+7));
327
328 _mm_store_si128(t_sub_block+4, mA);
329 _mm_store_si128(t_sub_block+5, mB);
330 _mm_store_si128(t_sub_block+6, mC);
331 _mm_store_si128(t_sub_block+7, mD);
332
333 }
334 else // just copy source
335 {
336 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
337 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
338 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
339 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
340
341 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
342 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
343 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
344 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
345 }
346 } // for i
347}
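A brief sketch of how the digest argument selects waves: bit i of the digest enables the XOR for wave i (bm::set_block_digest_wave_size words each); target_blk, src_blk and xor_blk are assumed aligned blocks used only for illustration.
<pre>
// illustrative only: XOR-filter waves 0 and 5, copy all other waves verbatim
bm::id64_t digest = (1ull << 0) | (1ull << 5);
bm::sse2_bit_block_xor(target_blk, src_blk, xor_blk, digest);
</pre>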
348
349/**
350 Build partial XOR product of 2 bit-blocks using digest mask
351
352 @param target_block - target ^= xor_block
353 @param xor_block - arg1
354 @param digest - mask for each block wave to XOR (if 1)
355
356 @ingroup SSE2
357 @internal
358*/
359inline
360void sse2_bit_block_xor_2way(bm::word_t* target_block,
361 const bm::word_t* xor_block,
362 bm::id64_t digest) BMNOEXCEPT
363{
364 while (digest)
365 {
366 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
367 unsigned wave = bm::word_bitcount64(t - 1);
368 unsigned off = wave * bm::set_block_digest_wave_size;
369
370 const __m128i* sub_block = (const __m128i*) (xor_block + off);
371 __m128i* t_sub_block = (__m128i*)(target_block + off);
372
373 __m128i mA, mB, mC, mD;
374 mA = _mm_xor_si128(_mm_load_si128(sub_block),
375 _mm_load_si128(t_sub_block));
376 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
377 _mm_load_si128(t_sub_block+1));
378 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
379 _mm_load_si128(t_sub_block+2));
380 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
381 _mm_load_si128(t_sub_block+3));
382
383 _mm_store_si128(t_sub_block, mA);
384 _mm_store_si128(t_sub_block+1, mB);
385 _mm_store_si128(t_sub_block+2, mC);
386 _mm_store_si128(t_sub_block+3, mD);
387
388 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
389 _mm_load_si128(t_sub_block+4));
390 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
391 _mm_load_si128(t_sub_block+5));
392 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
393 _mm_load_si128(t_sub_block+6));
394 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
395 _mm_load_si128(t_sub_block+7));
396
397 _mm_store_si128(t_sub_block+4, mA);
398 _mm_store_si128(t_sub_block+5, mB);
399 _mm_store_si128(t_sub_block+6, mC);
400 _mm_store_si128(t_sub_block+7, mD);
401
402 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
403 } // while
404}
405
406
407
408/*!
409 @brief AND block digest stride
410 *dst &= *src
411 @return true if stride is all zero
412 @ingroup SSE2
413*/
414BMFORCEINLINE
415bool sse2_and_digest(__m128i* BMRESTRICT dst,
416 const __m128i* BMRESTRICT src) BMNOEXCEPT
417{
418 __m128i m1A, m1B, m1C, m1D;
419 const __m128i maskz = _mm_setzero_si128();
420
421 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
422 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
423 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
424 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
425
426 _mm_store_si128(dst+0, m1A);
427 _mm_store_si128(dst+1, m1B);
428 _mm_store_si128(dst+2, m1C);
429 _mm_store_si128(dst+3, m1D);
430
431 m1A = _mm_or_si128(m1A, m1B);
432 m1C = _mm_or_si128(m1C, m1D);
433 m1A = _mm_or_si128(m1A, m1C);
434
435 bool z1 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
436
437 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
438 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
439 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
440 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
441
442 _mm_store_si128(dst+4, m1A);
443 _mm_store_si128(dst+5, m1B);
444 _mm_store_si128(dst+6, m1C);
445 _mm_store_si128(dst+7, m1D);
446
447 m1A = _mm_or_si128(m1A, m1B);
448 m1C = _mm_or_si128(m1C, m1D);
449 m1A = _mm_or_si128(m1A, m1C);
450
451 bool z2 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
452
453 return z1 & z2;
454}
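A hedged sketch of per-stride usage: one call covers one digest wave of bm::set_block_digest_wave_size words, and the boolean return can be folded back into a digest; wave, dst_blk, src_blk and digest are assumptions of the example.
<pre>
// illustrative only: AND one digest stride (wave) and update the digest
unsigned off = wave * bm::set_block_digest_wave_size;   // wave in [0..63]
bool stride_zero =
    bm::sse2_and_digest((__m128i*)(dst_blk + off),
                        (const __m128i*)(src_blk + off));
if (stride_zero)
    digest &= ~(1ull << wave);   // stride became empty
</pre>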
455
456/*!
457 @brief AND-OR block digest stride
458 *dst |= *src1 & *src2
459
460 @return true if stride is all zero
461 @ingroup SSE2
462*/
463BMFORCEINLINE
464bool sse2_and_or_digest_2way(__m128i* BMRESTRICT dst,
465 const __m128i* BMRESTRICT src1,
466 const __m128i* BMRESTRICT src2) BMNOEXCEPT
467{
468 __m128i m1A, m1B, m1C, m1D;
469 __m128i mACC1;
470 const __m128i maskz = _mm_setzero_si128();
471
472 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
473 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
474 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
475 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
476
477 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
478 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
479
480 m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
481 m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
482 m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
483 m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
484
485 _mm_store_si128(dst+0, m1A);
486 _mm_store_si128(dst+1, m1B);
487 _mm_store_si128(dst+2, m1C);
488 _mm_store_si128(dst+3, m1D);
489
490
491 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
492 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
493 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
494 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
495
496 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
497 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
498
499 m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
500 m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
501 m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
502 m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
503
504 _mm_store_si128(dst+4, m1A);
505 _mm_store_si128(dst+5, m1B);
506 _mm_store_si128(dst+6, m1C);
507 _mm_store_si128(dst+7, m1D);
508
509 return z1 & z2;
510}
511
512
513/*!
514 @brief AND block digest stride
515 @return true if stride is all zero
516 @ingroup SSE2
517*/
518inline
519bool sse2_and_digest_5way(__m128i* BMRESTRICT dst,
520 const __m128i* BMRESTRICT src1,
521 const __m128i* BMRESTRICT src2,
522 const __m128i* BMRESTRICT src3,
523 const __m128i* BMRESTRICT src4) BMNOEXCEPT
524{
525 __m128i m1A, m1B, m1C, m1D;
526 __m128i m1E, m1F, m1G, m1H;
527
528 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
529 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
530 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
531 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
532
533 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
534 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
535 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
536 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
537
538 m1A = _mm_and_si128(m1A, m1E);
539 m1B = _mm_and_si128(m1B, m1F);
540 m1C = _mm_and_si128(m1C, m1G);
541 m1D = _mm_and_si128(m1D, m1H);
542
543 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
544 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
545 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
546 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
547
548 _mm_store_si128(dst+0, m1A);
549 _mm_store_si128(dst+1, m1B);
550 _mm_store_si128(dst+2, m1C);
551 _mm_store_si128(dst+3, m1D);
552
553 m1A = _mm_or_si128(m1A, m1B);
554 m1C = _mm_or_si128(m1C, m1D);
555 m1A = _mm_or_si128(m1A, m1C);
556
557 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
558
559 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
560 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
561 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
562 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
563
564 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
565 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
566 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
567 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
568
569 m1A = _mm_and_si128(m1A, m1E);
570 m1B = _mm_and_si128(m1B, m1F);
571 m1C = _mm_and_si128(m1C, m1G);
572 m1D = _mm_and_si128(m1D, m1H);
573
574 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
575 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
576 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
577 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
578
579 _mm_store_si128(dst+4, m1A);
580 _mm_store_si128(dst+5, m1B);
581 _mm_store_si128(dst+6, m1C);
582 _mm_store_si128(dst+7, m1D);
583
584 m1A = _mm_or_si128(m1A, m1B);
585 m1C = _mm_or_si128(m1C, m1D);
586 m1A = _mm_or_si128(m1A, m1C);
587
588 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
589
590 return z1 & z2;
591}
592
593
594/*!
595 @brief AND block digest stride
596 *dst = *src1 & *src2
597
598 @return true if stride is all zero
599 @ingroup SSE2
600*/
601BMFORCEINLINE
602bool sse2_and_digest_2way(__m128i* BMRESTRICT dst,
603 const __m128i* BMRESTRICT src1,
604 const __m128i* BMRESTRICT src2) BMNOEXCEPT
605{
606 __m128i m1A, m1B, m1C, m1D;
607
608 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
609 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
610 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
611 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
612
613 _mm_store_si128(dst+0, m1A);
614 _mm_store_si128(dst+1, m1B);
615 _mm_store_si128(dst+2, m1C);
616 _mm_store_si128(dst+3, m1D);
617
618 m1A = _mm_or_si128(m1A, m1B);
619 m1C = _mm_or_si128(m1C, m1D);
620 m1A = _mm_or_si128(m1A, m1C);
621
622 const __m128i maskz = _mm_setzero_si128();
623 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
624
625 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
626 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
627 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
628 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
629
630 _mm_store_si128(dst+4, m1A);
631 _mm_store_si128(dst+5, m1B);
632 _mm_store_si128(dst+6, m1C);
633 _mm_store_si128(dst+7, m1D);
634
635 m1A = _mm_or_si128(m1A, m1B);
636 m1C = _mm_or_si128(m1C, m1D);
637 m1A = _mm_or_si128(m1A, m1C);
638
639 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
640
641 return z1 & z2;
642}
643
644/*!
645 @brief SUB (AND NOT) block digest stride
646 *dst &= ~*src
647
648 @return true if stride is all zero
649 @ingroup SSE2
650*/
651BMFORCEINLINE
652bool sse2_sub_digest(__m128i* BMRESTRICT dst,
653 const __m128i* BMRESTRICT src) BMNOEXCEPT
654{
655 __m128i m1A, m1B, m1C, m1D;
656 const __m128i maskz = _mm_setzero_si128();
657
658 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
659 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
660 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
661 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
662
663 _mm_store_si128(dst+0, m1A);
664 _mm_store_si128(dst+1, m1B);
665 _mm_store_si128(dst+2, m1C);
666 _mm_store_si128(dst+3, m1D);
667
668 m1A = _mm_or_si128(m1A, m1B);
669 m1C = _mm_or_si128(m1C, m1D);
670 m1A = _mm_or_si128(m1A, m1C);
671
672 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
673
674 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
675 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
676 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
677 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
678
679 _mm_store_si128(dst+4, m1A);
680 _mm_store_si128(dst+5, m1B);
681 _mm_store_si128(dst+6, m1C);
682 _mm_store_si128(dst+7, m1D);
683
684 m1A = _mm_or_si128(m1A, m1B);
685 m1C = _mm_or_si128(m1C, m1D);
686 m1A = _mm_or_si128(m1A, m1C);
687
688 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
689
690 return z1 & z2;
691}
692
693/*!
694 @brief 2-operand SUB (AND NOT) block digest stride
695 *dst = *src1 & ~*src2
696
697 @return true if stride is all zero
698 @ingroup SSE2
699*/
700BMFORCEINLINE
701bool sse2_sub_digest_2way(__m128i* BMRESTRICT dst,
702 const __m128i* BMRESTRICT src1,
703 const __m128i* BMRESTRICT src2) BMNOEXCEPT
704{
705 __m128i m1A, m1B, m1C, m1D;
706 const __m128i maskz = _mm_setzero_si128();
707
708 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
709 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
710 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
711 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
712
713 _mm_store_si128(dst+0, m1A);
714 _mm_store_si128(dst+1, m1B);
715 _mm_store_si128(dst+2, m1C);
716 _mm_store_si128(dst+3, m1D);
717
718 m1A = _mm_or_si128(m1A, m1B);
719 m1C = _mm_or_si128(m1C, m1D);
720 m1A = _mm_or_si128(m1A, m1C);
721
722 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
723
724 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
725 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
726 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
727 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
728
729 _mm_store_si128(dst+4, m1A);
730 _mm_store_si128(dst+5, m1B);
731 _mm_store_si128(dst+6, m1C);
732 _mm_store_si128(dst+7, m1D);
733
734 m1A = _mm_or_si128(m1A, m1B);
735 m1C = _mm_or_si128(m1C, m1D);
736 m1A = _mm_or_si128(m1A, m1C);
737
738 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
739
740 return z1 & z2;
741}
742
743
744/*!
745 \brief Find first non-zero bit
746 @ingroup SSE2
747*/
748inline
749bool sse2_bit_find_first(const __m128i* BMRESTRICT block,
750 unsigned* pos) BMNOEXCEPT
751{
752 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
753
754 const __m128i* block_end =
755 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
756 const __m128i maskZ = _mm_setzero_si128();
757 __m128i mA, mB;
758 unsigned simd_lane = 0;
759 int bsf;
760 do
761 {
762 mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
763 __m128i mOR = _mm_or_si128(mA, mB);
764 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
765 if (!z1) // test 2x128 lanes
766 {
767 z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
768 if (!z1)
769 {
770 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
771 mask = ~~mask; // invert to find (w != 0)
772 BM_ASSERT(mask);
773 bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
774 _mm_store_si128 ((__m128i*)simd_buf, mA);
775 unsigned widx = bsf >> 2; // (bsf / 4);
776 unsigned w = simd_buf[widx];
777 bsf = bm::bit_scan_forward32(w); // find first bit != 0
778 *pos = (simd_lane * 128) + (widx * 32) + bsf;
779 return true;
780 }
781 unsigned mask = (_mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)));
782 mask = ~~mask; // invert to find (w != 0)
783 BM_ASSERT(mask);
784 bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
785 _mm_store_si128 ((__m128i*)simd_buf, mB);
786 unsigned widx = bsf >> 2; // (bsf / 4);
787 unsigned w = simd_buf[widx];
788 bsf = bm::bit_scan_forward32(w); // find first bit != 0
789 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
790 return true;
791 }
792 simd_lane+=2;
793 block+=2;
794 } while (block < block_end);
795
796 return false;
797}
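A minimal sketch (blk is an assumed, 16-byte aligned bit-block pointer) of reading the result:
<pre>
// illustrative only: locate the first set bit of an aligned bit-block
unsigned bit_pos;
bool found = bm::sse2_bit_find_first((const __m128i*) blk, &bit_pos);
// when found == true, bit_pos is the bit index inside the 65536-bit block
</pre>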
798
799/*!
800 \brief Find first bit which is different between two bit-blocks
801 @ingroup SSE2
802*/
803inline
804bool sse2_bit_find_first_diff(const __m128i* BMRESTRICT block1,
805 const __m128i* BMRESTRICT block2,
806 unsigned* pos) BMNOEXCEPT
807{
808 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
809
810 const __m128i* block1_end =
811 (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
812 const __m128i maskZ = _mm_setzero_si128();
813 __m128i mA, mB;
814 unsigned simd_lane = 0;
815 do
816 {
817 mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
818 mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
819 __m128i mOR = _mm_or_si128(mA, mB);
820 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
821 if (!z1) // test 2x128 lanes
822 {
823 z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
824 if (!z1) // test 2x128 lanes
825 {
826 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
827 mask = ~~mask; // invert to find (w != 0)
828 BM_ASSERT(mask);
829 int bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
830 _mm_store_si128 ((__m128i*)simd_buf, mA);
831 unsigned widx = bsf >> 2; // (bsf / 4);
832 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
833 bsf = bm::bit_scan_forward32(w); // find first bit != 0
834 *pos = (simd_lane * 128) + (widx * 32) + bsf;
835 return true;
836 }
837 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
838 mask = ~~mask; // invert to find (w != 0)
839 BM_ASSERT(mask);
840 int bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
841 _mm_store_si128 ((__m128i*)simd_buf, mB);
842 unsigned widx = bsf >> 2; // (bsf / 4);
843 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
844 bsf = bm::bit_scan_forward32(w); // find first bit != 0
845 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
846 return true;
847 }
848 simd_lane+=2;
849 block1+=2; block2+=2;
850 } while (block1 < block1_end);
851 return false;
852}
853
854/*
855Snippets to extract32 in SSE2:
856
857inline int get_x(const __m128i& vec){return _mm_cvtsi128_si32 (vec);}
858inline int get_y(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0x55));}
859inline int get_z(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xAA));}
860inline int get_w(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xFF));}
861*/
862
863/*!
864 @brief block shift right by 1
865 @ingroup SSE2
866*/
867inline
868bool sse2_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
869{
870 __m128i* block_end =
871 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
872 __m128i m1COshft, m2COshft;
873 __m128i mAcc = _mm_set1_epi32(0);
874
875 __m128i mMask0 = _mm_set_epi32(-1,-1,-1, 0);
876
877 unsigned co2;
878 for (;block < block_end; block += 2)
879 {
880 __m128i m1A = _mm_load_si128(block);
881 __m128i m2A = _mm_load_si128(block+1);
882
883 __m128i m1CO = _mm_srli_epi32(m1A, 31);
884 __m128i m2CO = _mm_srli_epi32(m2A, 31);
885
886 co2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m1CO, 0xFF));
887
888 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
889 m2A = _mm_slli_epi32(m2A, 1);
890
891 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
892 m2COshft = _mm_slli_si128 (m2CO, 4);
893
894 m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[0]
895 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(0, 0, 0, co1)); // vec[0] = co1
896
897 m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[0]
898 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(0, 0, 0, co2)); // vec[0] = co2
899
900 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
901 m2A = _mm_or_si128(m2A, m2COshft);
902
903 co1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m2CO, 0xFF));
904
905 _mm_store_si128(block, m1A);
906 _mm_store_si128(block+1, m2A);
907
908 mAcc = _mm_or_si128(mAcc, m1A);
909 mAcc = _mm_or_si128(mAcc, m2A);
910 }
911 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
912 *empty_acc = !z1;
913 return co1;
914}
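A short sketch of the carry chaining convention (blk assumed aligned, carry_in taken from the previous block): the return value is the carry-out, and *empty_acc becomes non-zero when the shifted block still contains set bits.
<pre>
// illustrative only: shift one block right by one bit with carry propagation
unsigned acc;
unsigned carry_in = 0;                 // carry from the previous block
bool carry_out = bm::sse2_shift_r1((__m128i*) blk, &acc, carry_in);
</pre>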
915
916/*!
917 @brief block shift left by 1
918 @ingroup SSE2
919*/
920inline
921bool sse2_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
922{
923 __m128i* block_end =
924 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
925 __m128i mAcc = _mm_set1_epi32(0);
926 __m128i mMask1 = _mm_set1_epi32(1);
927 __m128i mMask0 = _mm_set_epi32(0, -1, -1, -1);
928
929 unsigned co2;
930 for (--block_end; block_end >= block; block_end -= 2)
931 {
932 __m128i m1A = _mm_load_si128(block_end);
933 __m128i m2A = _mm_load_si128(block_end-1);
934
935 __m128i m1CO = _mm_and_si128(m1A, mMask1);
936 __m128i m2CO = _mm_and_si128(m2A, mMask1);
937
938 co2 = _mm_cvtsi128_si32 (m1CO); // get vec[0]
939
940 m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
941 m2A = _mm_srli_epi32(m2A, 1);
942
943 __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
944 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
945
946 // m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
947 // m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
948 m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[0]
949 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(co1, 0, 0, 0)); // vec[3] = co1
950 m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[0]
951 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(co2, 0, 0, 0)); // vec[3] = co2
952
953
954 m1COshft = _mm_slli_epi32(m1COshft, 31);
955 m2COshft = _mm_slli_epi32(m2COshft, 31);
956
957 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
958 m2A = _mm_or_si128(m2A, m2COshft);
959
960 co1 = _mm_cvtsi128_si32 (m2CO); // get vec[0]
961
962 _mm_store_si128(block_end, m1A);
963 _mm_store_si128(block_end-1, m2A);
964
965 mAcc = _mm_or_si128(mAcc, m1A);
966 mAcc = _mm_or_si128(mAcc, m2A);
967 } // for
968
969 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
970 *empty_acc = !z1; // !_mm_testz_si128(mAcc, mAcc);
971 return co1;
972}
973
974
975
976inline
977bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
978 const __m128i* BMRESTRICT block_end,
979 unsigned* BMRESTRICT bit_count)
980{
981 const unsigned mu1 = 0x55555555;
982 const unsigned mu2 = 0x33333333;
983 const unsigned mu3 = 0x0F0F0F0F;
984 const unsigned mu4 = 0x0000003F;
985
986 // Loading masks
987 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
988 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
989 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
990 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
991 __m128i mcnt;//, ccnt;
992 mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
993 //ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0
994
995 __m128i tmp1, tmp2;
996
997 int count = (int)(block_end - block)*4; //0;//1;
998
999 bm::word_t w, w0, w_prev;//, w_l;
1000 const int w_shift = sizeof(w) * 8 - 1;
1001 bool first_word = true;
1002
1003 // first word
1004 {
1005 const bm::word_t* blk = (const bm::word_t*) block;
1006 w = w0 = blk[0];
1007 w ^= (w >> 1);
1008 count += bm::word_bitcount(w);
1009 count -= (w_prev = (w0 >> w_shift)); // negative value correction
1010 }
1011
1012 bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
1013
1014 do
1015 {
1016 // compute bit-count
1017 // ---------------------------------------------------------------------
1018 {
1019 __m128i b = _mm_load_si128(block);
1020
1021 // w ^(w >> 1)
1022 tmp1 = _mm_srli_epi32(b, 1); // tmp1 = b >> 1
1023 tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
1024 _mm_store_si128((__m128i*)tcnt, tmp2);
1025
1026
1027 // compare with zero
1028 // SSE4: _mm_test_all_zero()
1029 {
1030 // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
1031 //tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
1032 tmp1 = _mm_and_si128(tmp1, m1);
1033 tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
1034 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
1035
1036 // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
1037 tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
1038 tmp1 = _mm_and_si128(tmp1, m2);
1039 tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
1040 b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
1041
1042 // b = (b + (b >> 4)) & 0x0F0F0F0F;
1043 tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
1044 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
1045 b = _mm_and_si128(b, m3); //& 0x0F0F0F0F
1046
1047 // b = b + (b >> 8);
1048 tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
1049 b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
1050
1051 // b = (b + (b >> 16)) & 0x0000003F;
1052 tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
1053 b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
1054 b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
1055
1056 mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
1057 }
1058
1059 }
1060 // ---------------------------------------------------------------------
1061 {
1062 //__m128i b = _mm_load_si128(block);
1063 // TODO: SSE4...
1064 //w = _mm_extract_epi32(b, i);
1065
1066 const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;
1067
1068 if (first_word)
1069 {
1070 first_word = false;
1071 }
1072 else
1073 {
1074 if (0!=(w0=blk[0]))
1075 {
1076 count += bm::word_bitcount(tcnt[0]);
1077 count -= !(w_prev ^ (w0 & 1));
1078 count -= w_prev = (w0 >> w_shift);
1079 }
1080 else
1081 {
1082 count -= !w_prev; w_prev ^= w_prev;
1083 }
1084 }
1085 if (0!=(w0=blk[1]))
1086 {
1087 count += bm::word_bitcount(tcnt[1]);
1088 count -= !(w_prev ^ (w0 & 1));
1089 count -= w_prev = (w0 >> w_shift);
1090 }
1091 else
1092 {
1093 count -= !w_prev; w_prev ^= w_prev;
1094 }
1095 if (0!=(w0=blk[2]))
1096 {
1097 count += bm::word_bitcount(tcnt[2]);
1098 count -= !(w_prev ^ (w0 & 1));
1099 count -= w_prev = (w0 >> w_shift);
1100 }
1101 else
1102 {
1103 count -= !w_prev; w_prev ^= w_prev;
1104 }
1105 if (0!=(w0=blk[3]))
1106 {
1107 count += bm::word_bitcount(tcnt[3]);
1108 count -= !(w_prev ^ (w0 & 1));
1109 count -= w_prev = (w0 >> w_shift);
1110 }
1111 else
1112 {
1113 count -= !w_prev; w_prev ^= w_prev;
1114 }
1115 }
1116 } while (++block < block_end);
1117
1118 _mm_store_si128((__m128i*)tcnt, mcnt);
1119 *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
1120
1121 return unsigned(count);
1122}
1123
1124#ifdef __GNUG__
1125// necessary measure to silence a false GCC warning about negative pointer arithmetic
1126#pragma GCC diagnostic push
1127#pragma GCC diagnostic ignored "-Warray-bounds"
1128#endif
1129
1130/*!
1131SSE2 check of one to two (variable length) 128-bit SSE lines for gap search results (8 elements)
1132\internal
1133*/
1134inline
1135unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
1136{
1137 BM_ASSERT(size <= 16);
1138 BM_ASSERT(size);
1139
1140 const unsigned unroll_factor = 8;
1141 if (size < 4) // for very short vector use conventional scan
1142 {
1143 unsigned j;
1144 for (j = 0; j < size; ++j)
1145 {
1146 if (pbuf[j] >= pos)
1147 break;
1148 }
1149 return j;
1150 }
1151
1152 __m128i m1, mz, maskF, maskFL;
1153
1154 mz = _mm_setzero_si128();
1155 m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1156
1157 maskF = _mm_cmpeq_epi32(mz, mz); // set all FF
1158 maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1159 int shiftL = (64 - (unroll_factor - size) * 16);
1160 maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1161
1162 m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1163 m1 = _mm_or_si128(m1, maskFL);
1164
1165 __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1166 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1167 int mi = _mm_movemask_epi8(mge_mask); // collect flag bits
1168 if (mi)
1169 {
1170 int bsr_i= bm::bit_scan_fwd(mi) >> 1;
1171 return bsr_i; // address of first one element (target)
1172 }
1173 if (size == 8)
1174 return size;
1175
1176 // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
1177 // GCC gives a false warning for "- unroll_factor" here
1178 const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1179 BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure GCC warning is indeed false
1180
1181 m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1182 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1183 mi = _mm_movemask_epi8(mge_mask);
1184 if (mi)
1185 {
1186 int bsr_i = bm::bit_scan_fwd(mi) >> 1;
1187 return size - (unroll_factor - bsr_i);
1188 }
1189 return size;
1190}
1191
1192/**
1193 Hybrid binary search, starts as binary, then switches to linear scan
1194
1195 \param buf - GAP buffer pointer.
1196 \param pos - index of the element.
1197 \param is_set - output. GAP value (0 or 1).
1198 \return GAP index.
1199
1200 @ingroup SSE2
1201*/
1202inline
1203unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
1204 unsigned pos, unsigned* BMRESTRICT is_set)
1205{
1206 unsigned start = 1;
1207 unsigned end = 1 + ((*buf) >> 3);
1208 unsigned dsize = end - start;
1209
1210 if (dsize < 17)
1211 {
1212 start = bm::sse2_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
1213 *is_set = ((*buf) & 1) ^ (start & 1);
1214 BM_ASSERT(buf[start+1] >= pos);
1215 BM_ASSERT(buf[start] < pos || (start==0));
1216
1217 return start+1;
1218 }
1219 unsigned arr_end = end;
1220 while (start != end)
1221 {
1222 unsigned curr = (start + end) >> 1;
1223 if (buf[curr] < pos)
1224 start = curr + 1;
1225 else
1226 end = curr;
1227
1228 unsigned size = end - start;
1229 if (size < 16)
1230 {
1231 size += (end != arr_end);
1232 unsigned idx =
1233 bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size);
1234 start += idx;
1235
1236 BM_ASSERT(buf[start] >= pos);
1237 BM_ASSERT(buf[start - 1] < pos || (start == 1));
1238 break;
1239 }
1240 }
1241
1242 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1243 return start;
1244}
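A minimal sketch (gap_buf and bit_pos are assumptions of the example) of querying a GAP block:
<pre>
// illustrative only: test the bit value at bit_pos inside a GAP block
unsigned is_set;
unsigned gap_idx = bm::sse2_gap_bfind(gap_buf, bit_pos, &is_set);
// is_set is the GAP value (0 or 1); gap_idx is the index of the GAP element
</pre>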
1245
1246/**
1247 Hybrid binary search, starts as binary, then switches to scan
1248 @ingroup SSE2
1249*/
1250inline
1251unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
1252{
1253 unsigned is_set;
1254 bm::sse2_gap_bfind(buf, pos, &is_set);
1255 return is_set;
1256}
1257
1258
1259
1260
1261#ifdef __GNUG__
1262#pragma GCC diagnostic pop
1263#endif
1264
1265
1266#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1267 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1268
1269#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1270 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1271
1272#define VECT_BITCOUNT(first, last) \
1273 sse2_bit_count((__m128i*) (first), (__m128i*) (last))
1274
1275#define VECT_BITCOUNT_AND(first, last, mask) \
1276 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1277
1278#define VECT_BITCOUNT_OR(first, last, mask) \
1279 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1280
1281#define VECT_BITCOUNT_XOR(first, last, mask) \
1282 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1283
1284#define VECT_BITCOUNT_SUB(first, last, mask) \
1285 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1286
1287#define VECT_INVERT_BLOCK(first) \
1288 sse2_invert_block((__m128i*)first);
1289
1290#define VECT_AND_BLOCK(dst, src) \
1291 sse2_and_block((__m128i*) dst, (__m128i*) (src))
1292
1293#define VECT_AND_DIGEST(dst, src) \
1294 sse2_and_digest((__m128i*) dst, (const __m128i*) (src))
1295
1296#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
1297 sse2_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1298
1299#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1300 sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1301
1302#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1303 sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1304
1305#define VECT_OR_BLOCK(dst, src) \
1306 sse2_or_block((__m128i*) dst, (__m128i*) (src))
1307
1308#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1309 sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1310
1311#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1312 sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1313
1314#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1315 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1316
1317#define VECT_SUB_BLOCK(dst, src) \
1318 sse2_sub_block((__m128i*) dst, (__m128i*) (src))
1319
1320#define VECT_SUB_DIGEST(dst, src) \
1321 sse2_sub_digest((__m128i*) dst, (const __m128i*) (src))
1322
1323#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1324 sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1325
1326#define VECT_XOR_BLOCK(dst, src) \
1327 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1328
1329#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1330 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1331
1332#define VECT_COPY_BLOCK(dst, src) \
1333 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1334
1335#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
1336 sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
1337
1338#define VECT_STREAM_BLOCK(dst, src) \
1339 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1340
1341#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
1342 sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
1343
1344#define VECT_SET_BLOCK(dst, value) \
1345 sse2_set_block((__m128i*) dst, value)
1346
1347#define VECT_IS_ZERO_BLOCK(dst) \
1348 sse2_is_all_zero((__m128i*) dst)
1349
1350#define VECT_IS_ONE_BLOCK(dst) \
1351 sse2_is_all_one((__m128i*) dst)
1352
1353#define VECT_IS_DIGEST_ZERO(start) \
1354 sse2_is_digest_zero((__m128i*)start)
1355
1356#define VECT_BLOCK_SET_DIGEST(dst, val) \
1357 sse2_block_set_digest((__m128i*)dst, val)
1358
1359#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1360 sse2_lower_bound_scan_u32(arr, target, from, to)
1361
1362#define VECT_SHIFT_R1(b, acc, co) \
1363 sse2_shift_r1((__m128i*)b, acc, co)
1364
1365
1366#define VECT_BIT_FIND_FIRST(src, pos) \
1367 sse2_bit_find_first((__m128i*) src, pos)
1368
1369#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1370 sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1371
1372#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1373 sse2_bit_block_xor(t, src, src_xor, d)
1374
1375#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
1376 sse2_bit_block_xor_2way(t, src_xor, d)
1377
1378#define VECT_GAP_BFIND(buf, pos, is_set) \
1379 sse2_gap_bfind(buf, pos, is_set)
1380
1381} // namespace
1382
1383
1384#ifdef __GNUG__
1385#pragma GCC diagnostic pop
1386#endif
1387
1388
1389#endif