BitMagic-C++
bmsse4.h
1#ifndef BMSSE4__H__INCLUDED__
2#define BMSSE4__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmsse4.h
22 \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23
24 Aside from SSE4.2, it also compiles in WASM SIMD mode
25 for the 128-bit portable SIMD target.
26*/
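/*
    Usage sketch (illustrative, not a definition made in this header):
    applications normally do not include bmsse4.h directly. Defining
    BMSSE42OPT before including bm.h selects these kernels through the
    VECT_* macros defined at the bottom of this file:

        #define BMSSE42OPT
        #include "bm.h"  // bm::bvector<> now dispatches into sse4_* / sse42_* code
*/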
27
28#ifndef BMWASMSIMDOPT
29#include<mmintrin.h>
30#endif
31#include<emmintrin.h>
32#include<smmintrin.h>
33#include<nmmintrin.h>
34#include<immintrin.h>
35
36#include "bmdef.h"
37#include "bmutil.h"
38#include "bmsse_util.h"
39
40namespace bm
41{
42
43/** @defgroup SSE4 SSE4.2 functions (internal)
44 Processor-specific optimizations for SSE4.2 instructions (internals)
45 @internal
46 @ingroup bvector
47 */
48
49#ifdef __GNUG__
50#pragma GCC diagnostic push
51#pragma GCC diagnostic ignored "-Wconversion"
52#endif
53
54#ifdef _MSC_VER
55#pragma warning( push )
56#pragma warning( disable : 4146)
57#endif
58
59
60// WASM build: define missing POPCNT intrinsics via GCC built-ins
61#ifdef BMWASMSIMDOPT
62# define _mm_popcnt_u32 __builtin_popcount
63# define _mm_popcnt_u64 __builtin_popcountll
64# define BM_BSF32 __builtin_ctz
65#else
66# define BM_BSF32 bm::bsf_asm32
67#endif
68
69
70/*
71inline
72void sse2_print128(const char* prefix, const __m128i & value)
73{
74 const size_t n = sizeof(__m128i) / sizeof(unsigned);
75 unsigned buffer[n];
76 _mm_storeu_si128((__m128i*)buffer, value);
77 std::cout << prefix << " [ ";
78 for (int i = n-1; 1; --i)
79 {
80 std::cout << buffer[i] << " ";
81 if (i == 0)
82 break;
83 }
84 std::cout << "]" << std::endl;
85}
86*/
87
88/*!
89 SSE4.2 optimized bit counting.
90 @ingroup SSE4
91*/
92inline
93bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end) BMNOEXCEPT
94{
95 bm::id_t count = 0;
96#ifdef BM64_SSE4
97 const bm::id64_t* b = (bm::id64_t*) block;
98 const bm::id64_t* b_end = (bm::id64_t*) block_end;
99 do
100 {
101 count += unsigned( _mm_popcnt_u64(b[0]) +
102 _mm_popcnt_u64(b[1]) +
103 _mm_popcnt_u64(b[2]) +
104 _mm_popcnt_u64(b[3]));
105 b += 4;
106 } while (b < b_end);
107#else
108 do
109 {
110 const unsigned* b = (unsigned*) block;
111 count += _mm_popcnt_u32(b[0]) +
112 _mm_popcnt_u32(b[1]) +
113 _mm_popcnt_u32(b[2]) +
114 _mm_popcnt_u32(b[3]);
115 } while (++block < block_end);
116#endif
117 return count;
118}
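/*
    Scalar reference for sse4_bit_count() (a sketch for clarity, not part of
    the library): the unrolled SIMD loop above is equivalent to accumulating
    POPCNT over every 32-bit word in [block, block_end).

    inline bm::id_t scalar_bit_count(const bm::word_t* w,
                                     const bm::word_t* w_end) BMNOEXCEPT
    {
        bm::id_t count = 0;
        for (; w < w_end; ++w)
            count += unsigned(_mm_popcnt_u32(*w));
        return count;
    }
*/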
119
120
121
122/*!
123\internal
124*/
125BMFORCEINLINE
126unsigned op_xor(unsigned a, unsigned b) BMNOEXCEPT
127{
128 unsigned ret = (a ^ b);
129 return ret;
130}
131
132/*!
133\internal
134*/
135BMFORCEINLINE
136unsigned op_or(unsigned a, unsigned b) BMNOEXCEPT
137{
138 return (a | b);
139}
140
141/*!
142\internal
143*/
144BMFORCEINLINE
145unsigned op_and(unsigned a, unsigned b) BMNOEXCEPT
146{
147 return (a & b);
148}
149
150
151template<class Func>
152bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
153 const __m128i* BMRESTRICT block_end,
154 const __m128i* BMRESTRICT mask_block,
155 Func sse2_func) BMNOEXCEPT
156{
157 bm::id_t count = 0;
158#ifdef BM64_SSE4
159 bm::id64_t BM_ALIGN16 tcnt[2] BM_ALIGN16ATTR;
160 do
161 {
162 __m128i b = sse2_func(_mm_load_si128(block), _mm_load_si128(mask_block));
163 _mm_store_si128((__m128i*)tcnt, b);
164 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
165
166 b = sse2_func(_mm_load_si128(block+1), _mm_load_si128(mask_block+1));
167 _mm_store_si128((__m128i*)tcnt, b);
168 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
169 block+=2; mask_block+=2;
170 } while (block < block_end);
171#else
172 do
173 {
174 __m128i tmp0 = _mm_load_si128(block);
175 __m128i tmp1 = _mm_load_si128(mask_block);
176 __m128i b = sse2_func(tmp0, tmp1);
177
178 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
179 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
180 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
181 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
182
183 ++block; ++mask_block;
184 } while (block < block_end);
185#endif
186
187 return count;
188}
189
190/*!
191 @brief check if block is all zero bits
192 @ingroup SSE4
193*/
194inline
195bool sse4_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
196{
197 __m128i w;
198 __m128i maskz = _mm_setzero_si128();
199 const __m128i* BMRESTRICT block_end =
200 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
201
202 do
203 {
204 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
205 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
206 return false;
207 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
208 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
209 return false;
210 block += 4;
211 } while (block < block_end);
212 return true;
213}
214
215/*!
216 @brief check if digest stride is all zero bits
217 @ingroup SSE4
218*/
219BMFORCEINLINE
220bool sse4_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
221{
222 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
223 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
224 wA = _mm_or_si128(wA, wB);
225 bool z1 = _mm_test_all_zeros(wA, wA);
226
227 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
228 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
229 wA = _mm_or_si128(wA, wB);
230 bool z2 = _mm_test_all_zeros(wA, wA);
231 return z1 & z2;
232}
233
234/*!
235 @brief set digest stride to 0xFF.. or 0x0 value
236 @ingroup SSE4
237*/
238BMFORCEINLINE
239void sse4_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
240{
241 __m128i mV = _mm_set1_epi32(int(value));
242 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
243 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
244 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
245 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
246}
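/*
    Note on digest strides (sketch based on the code above): one digest bit
    covers one wave of bm::set_block_digest_wave_size words, i.e. exactly the
    8 __m128i loads/stores (8 x 4 = 32 words) issued per call here. A scalar
    equivalent of sse4_is_digest_zero():

    inline bool scalar_is_digest_zero(const bm::word_t* w) BMNOEXCEPT
    {
        bm::word_t acc = 0;
        for (unsigned i = 0; i < bm::set_block_digest_wave_size; ++i)
            acc |= w[i];
        return !acc;
    }
*/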
247
248
249/*!
250 @brief AND blocks2
251 *dst &= *src
252
253 @return 0 if no bits were set
254 @ingroup SSE4
255*/
256inline
257unsigned sse4_and_block(__m128i* BMRESTRICT dst,
258 const __m128i* BMRESTRICT src) BMNOEXCEPT
259{
260 __m128i m1A, m1B, m1C, m1D;
261 __m128i accA, accB, accC, accD;
262
263 const __m128i* BMRESTRICT src_end =
264 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
265
266 accA = accB = accC = accD = _mm_setzero_si128();
267
268 do
269 {
270 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
271 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
272 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
273 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
274
275 _mm_store_si128(dst+0, m1A);
276 _mm_store_si128(dst+1, m1B);
277 _mm_store_si128(dst+2, m1C);
278 _mm_store_si128(dst+3, m1D);
279
280 accA = _mm_or_si128(accA, m1A);
281 accB = _mm_or_si128(accB, m1B);
282 accC = _mm_or_si128(accC, m1C);
283 accD = _mm_or_si128(accD, m1D);
284
285 src += 4; dst += 4;
286 } while (src < src_end);
287
288 accA = _mm_or_si128(accA, accB); // A = A | B
289 accC = _mm_or_si128(accC, accD); // C = C | D
290 accA = _mm_or_si128(accA, accC); // A = A | C
291
292 return !_mm_testz_si128(accA, accA);
293}
294
295
296/*!
297 @brief AND block digest stride
298 *dst &= *src
299
300 @return true if stride is all zero
301 @ingroup SSE4
302*/
303BMFORCEINLINE
304bool sse4_and_digest(__m128i* BMRESTRICT dst,
305 const __m128i* BMRESTRICT src) BMNOEXCEPT
306{
307 __m128i m1A, m1B, m1C, m1D;
308
309 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
310 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
311 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
312 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
313
314 _mm_store_si128(dst+0, m1A);
315 _mm_store_si128(dst+1, m1B);
316 _mm_store_si128(dst+2, m1C);
317 _mm_store_si128(dst+3, m1D);
318
319 m1A = _mm_or_si128(m1A, m1B);
320 m1C = _mm_or_si128(m1C, m1D);
321 m1A = _mm_or_si128(m1A, m1C);
322
323 bool z1 = _mm_testz_si128(m1A, m1A);
324
325 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
326 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
327 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
328 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
329
330 _mm_store_si128(dst+4, m1A);
331 _mm_store_si128(dst+5, m1B);
332 _mm_store_si128(dst+6, m1C);
333 _mm_store_si128(dst+7, m1D);
334
335 m1A = _mm_or_si128(m1A, m1B);
336 m1C = _mm_or_si128(m1C, m1D);
337 m1A = _mm_or_si128(m1A, m1C);
338
339 bool z2 = _mm_testz_si128(m1A, m1A);
340
341 return z1 & z2;
342}
343
344/*!
345 @brief AND block digest stride
346 *dst = *src1 & src2
347
348 @return true if stride is all zero
349 @ingroup SSE4
350*/
351BMFORCEINLINE
352bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
353 const __m128i* BMRESTRICT src1,
354 const __m128i* BMRESTRICT src2) BMNOEXCEPT
355{
356 __m128i m1A, m1B, m1C, m1D;
357
358 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
359 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
360 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
361 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
362
363 _mm_store_si128(dst+0, m1A);
364 _mm_store_si128(dst+1, m1B);
365 _mm_store_si128(dst+2, m1C);
366 _mm_store_si128(dst+3, m1D);
367
368 m1A = _mm_or_si128(m1A, m1B);
369 m1C = _mm_or_si128(m1C, m1D);
370 m1A = _mm_or_si128(m1A, m1C);
371
372 bool z1 = _mm_testz_si128(m1A, m1A);
373
374 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
375 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
376 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
377 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
378
379 _mm_store_si128(dst+4, m1A);
380 _mm_store_si128(dst+5, m1B);
381 _mm_store_si128(dst+6, m1C);
382 _mm_store_si128(dst+7, m1D);
383
384 m1A = _mm_or_si128(m1A, m1B);
385 m1C = _mm_or_si128(m1C, m1D);
386 m1A = _mm_or_si128(m1A, m1C);
387
388 bool z2 = _mm_testz_si128(m1A, m1A);
389
390 return z1 & z2;
391}
392
393/*!
394 @brief AND-OR block digest stride
395 *dst |= *src1 & src2
396
397 @return true if stride is all zero
398 @ingroup SSE4
399*/
400inline
401bool sse4_and_or_digest_2way(__m128i* BMRESTRICT dst,
402 const __m128i* BMRESTRICT src1,
403 const __m128i* BMRESTRICT src2) BMNOEXCEPT
404{
405 __m128i m1A, m1B, m1C, m1D;
406 __m128i mACC1;
407
408 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
409 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
410 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
411 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
412
413 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
414 bool z1 = _mm_testz_si128(mACC1, mACC1);
415
416 m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
417 m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
418 m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
419 m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
420
421 _mm_store_si128(dst+0, m1A);
422 _mm_store_si128(dst+1, m1B);
423 _mm_store_si128(dst+2, m1C);
424 _mm_store_si128(dst+3, m1D);
425
426
427 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
428 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
429 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
430 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
431
432 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
433 bool z2 = _mm_testz_si128(mACC1, mACC1);
434
435 m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
436 m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
437 m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
438 m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
439
440 _mm_store_si128(dst+4, m1A);
441 _mm_store_si128(dst+5, m1B);
442 _mm_store_si128(dst+6, m1C);
443 _mm_store_si128(dst+7, m1D);
444
445 return z1 & z2;
446}
447
448
449/*!
450 @brief AND block digest stride
451 @return true if stride is all zero
452 @ingroup SSE4
453*/
454inline
455bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
456 const __m128i* BMRESTRICT src1,
457 const __m128i* BMRESTRICT src2,
458 const __m128i* BMRESTRICT src3,
459 const __m128i* BMRESTRICT src4) BMNOEXCEPT
460{
461 __m128i m1A, m1B, m1C, m1D;
462 __m128i m1E, m1F, m1G, m1H;
463
464 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
465 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
466 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
467 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
468
469 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
470 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
471 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
472 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
473
474 m1A = _mm_and_si128(m1A, m1E);
475 m1B = _mm_and_si128(m1B, m1F);
476 m1C = _mm_and_si128(m1C, m1G);
477 m1D = _mm_and_si128(m1D, m1H);
478
479 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
480 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
481 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
482 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
483
484 _mm_store_si128(dst+0, m1A);
485 _mm_store_si128(dst+1, m1B);
486 _mm_store_si128(dst+2, m1C);
487 _mm_store_si128(dst+3, m1D);
488
489 m1A = _mm_or_si128(m1A, m1B);
490 m1C = _mm_or_si128(m1C, m1D);
491 m1A = _mm_or_si128(m1A, m1C);
492
493 bool z1 = _mm_testz_si128(m1A, m1A);
494
495 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
496 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
497 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
498 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
499
500 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
501 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
502 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
503 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
504
505 m1A = _mm_and_si128(m1A, m1E);
506 m1B = _mm_and_si128(m1B, m1F);
507 m1C = _mm_and_si128(m1C, m1G);
508 m1D = _mm_and_si128(m1D, m1H);
509
510 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
511 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
512 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
513 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
514
515 _mm_store_si128(dst+4, m1A);
516 _mm_store_si128(dst+5, m1B);
517 _mm_store_si128(dst+6, m1C);
518 _mm_store_si128(dst+7, m1D);
519
520 m1A = _mm_or_si128(m1A, m1B);
521 m1C = _mm_or_si128(m1C, m1D);
522 m1A = _mm_or_si128(m1A, m1C);
523
524 bool z2 = _mm_testz_si128(m1A, m1A);
525
526 return z1 & z2;
527}
528
529
530/*!
531 @brief SUB (AND NOT) block digest stride
532 *dst &= ~*src
533
534 @return true if stride is all zero
535 @ingroup SSE4
536*/
537BMFORCEINLINE
538bool sse4_sub_digest(__m128i* BMRESTRICT dst,
539 const __m128i* BMRESTRICT src) BMNOEXCEPT
540{
541 __m128i m1A, m1B, m1C, m1D;
542
543 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
544 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
545 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
546 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
547
548 _mm_store_si128(dst+0, m1A);
549 _mm_store_si128(dst+1, m1B);
550 _mm_store_si128(dst+2, m1C);
551 _mm_store_si128(dst+3, m1D);
552
553 m1A = _mm_or_si128(m1A, m1B);
554 m1C = _mm_or_si128(m1C, m1D);
555 m1A = _mm_or_si128(m1A, m1C);
556
557 bool z1 = _mm_testz_si128(m1A, m1A);
558
559 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
560 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
561 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
562 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
563
564 _mm_store_si128(dst+4, m1A);
565 _mm_store_si128(dst+5, m1B);
566 _mm_store_si128(dst+6, m1C);
567 _mm_store_si128(dst+7, m1D);
568
569 m1A = _mm_or_si128(m1A, m1B);
570 m1C = _mm_or_si128(m1C, m1D);
571 m1A = _mm_or_si128(m1A, m1C);
572
573 bool z2 = _mm_testz_si128(m1A, m1A);
574
575 return z1 & z2;
576}
577
578
579/*!
580 @brief 2-operand SUB (AND NOT) block digest stride
581 *dst = src1 & ~*src2
582
583 @return true if stride is all zero
584 @ingroup SSE4
585*/
586BMFORCEINLINE
587bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
588 const __m128i* BMRESTRICT src1,
589 const __m128i* BMRESTRICT src2) BMNOEXCEPT
590{
591 __m128i m1A, m1B, m1C, m1D;
592
593 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
594 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
595 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
596 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
597
598 _mm_store_si128(dst+0, m1A);
599 _mm_store_si128(dst+1, m1B);
600 _mm_store_si128(dst+2, m1C);
601 _mm_store_si128(dst+3, m1D);
602
603 m1A = _mm_or_si128(m1A, m1B);
604 m1C = _mm_or_si128(m1C, m1D);
605 m1A = _mm_or_si128(m1A, m1C);
606
607 bool z1 = _mm_testz_si128(m1A, m1A);
608
609 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
610 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
611 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
612 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
613
614 _mm_store_si128(dst+4, m1A);
615 _mm_store_si128(dst+5, m1B);
616 _mm_store_si128(dst+6, m1C);
617 _mm_store_si128(dst+7, m1D);
618
619 m1A = _mm_or_si128(m1A, m1B);
620 m1C = _mm_or_si128(m1C, m1D);
621 m1A = _mm_or_si128(m1A, m1C);
622
623 bool z2 = _mm_testz_si128(m1A, m1A);
624
625 return z1 & z2;
626}
627
628
629
630/*!
631 @brief check if block is all ONE bits
632 @ingroup SSE4
633*/
634inline
635bool sse4_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
636{
637 __m128i w;
638 const __m128i* BMRESTRICT block_end =
639 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
640
641 do
642 {
643 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
644 if (!_mm_test_all_ones(w))
645 return false;
646 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
647 if (!_mm_test_all_ones(w))
648 return false;
649
650 block+=4;
651 } while (block < block_end);
652 return true;
653}
654
655/*!
656 @brief check if SSE wave is all 0xFFFF...FFF
657 @ingroup SSE4
658*/
659BMFORCEINLINE
660bool sse42_test_all_one_wave(const void* ptr) BMNOEXCEPT
661{
662 return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
663}
664
665
666/*!
667 @brief check if wave of pointers is all NULL
668 @ingroup SSE4
669*/
670BMFORCEINLINE
671bool sse42_test_all_zero_wave(const void* ptr) BMNOEXCEPT
672{
673 __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
674 return _mm_testz_si128(w0, w0);
675}
676
677/*!
678 @brief check if 2 waves of pointers are all NULL
679 @ingroup SSE4
680*/
681BMFORCEINLINE
682bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
683{
684 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
685 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
686 w0 = _mm_or_si128(w0, w1);
687 return _mm_testz_si128(w0, w0);
688}
689
690/*!
691 @brief check if a wave of 2 pointers is the same (null or FULL)
692 @ingroup SSE4
693*/
694BMFORCEINLINE
695bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
696{
697 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
698 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
699 w0 = _mm_xor_si128(w0, w1);
700 return _mm_testz_si128(w0, w0);
701}
702
703
704/*!
705 SSE4.2 calculate number of bit changes from 0 to 1
706 @ingroup SSE4
707*/
708inline
709unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block,
710 unsigned size) BMNOEXCEPT
711{
712 bm::id64_t BM_ALIGN16 tcnt[2] BM_ALIGN16ATTR;
713
714 const __m128i* block_end =
715 ( __m128i*)((bm::word_t*)(block) + size); // bm::set_block_size
716 __m128i m1COshft, m2COshft;
717
718 unsigned w0 = *((bm::word_t*)(block));
719 unsigned count = 1;
720
721 unsigned co2, co1 = 0;
722 for (;block < block_end; block += 2)
723 {
724 __m128i m1A = _mm_load_si128(block);
725 __m128i m2A = _mm_load_si128(block+1);
726
727 __m128i m1CO = _mm_srli_epi32(m1A, 31);
728 __m128i m2CO = _mm_srli_epi32(m2A, 31);
729
730 co2 = _mm_extract_epi32(m1CO, 3);
731
732 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
733 __m128i m2As = _mm_slli_epi32(m2A, 1);
734
735 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
736 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
737
738 co1 = co2;
739
740 co2 = _mm_extract_epi32(m2CO, 3);
741
742 m2COshft = _mm_slli_si128 (m2CO, 4);
743 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
744
745 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
746 m2As = _mm_or_si128(m2As, m2COshft);
747
748 co1 = co2;
749
750 // we now have two shifted SSE4 regs with carry-over
751 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
752 m2A = _mm_xor_si128(m2A, m2As);
753
754#ifdef BM64_SSE4
755 _mm_store_si128((__m128i*)tcnt, m1A);
756 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
757 _mm_store_si128((__m128i*)tcnt, m2A);
758 count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
759#else
760 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
761 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
762 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
763 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
764 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
765 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
766
767 m0 = _mm_extract_epi32(m2A, 0);
768 m1 = _mm_extract_epi32(m2A, 1);
769 m2 = _mm_extract_epi32(m2A, 2);
770 m3 = _mm_extract_epi32(m2A, 3);
771 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
772 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
773#endif
774
775 }
776 count -= (w0 & 1u); // correct initial carry-in error
777 return count;
778}
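/*
    Scalar reference for the carry-over technique above (a sketch, not part
    of the library): per 32-bit word w[i], the change count adds
    popcount(w[i] ^ ((w[i] << 1) | carry)), where carry is the MSB of the
    previous word.

    inline unsigned scalar_block_calc_change(const bm::word_t* w,
                                             unsigned size) BMNOEXCEPT
    {
        unsigned count = 1;                 // same seed as the SIMD version
        unsigned co = 0;                    // carry-over from the previous word
        for (unsigned i = 0; i < size; ++i)
        {
            bm::word_t ws = (w[i] << 1) | co;
            co = w[i] >> 31;
            count += unsigned(_mm_popcnt_u32(w[i] ^ ws));
        }
        return count - (w[0] & 1u);         // correct the initial carry-in
    }
*/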
779
780
781/*!
782 SSE4.2 calculate number of bit changes from 0 to 1 of a XOR product
783 @ingroup SSE4
784*/
785inline
786void sse42_bit_block_calc_xor_change(const __m128i* BMRESTRICT block,
787 const __m128i* BMRESTRICT xor_block,
788 unsigned size,
789 unsigned* BMRESTRICT gc,
790 unsigned* BMRESTRICT bc) BMNOEXCEPT
791{
792#ifdef BM64_SSE4
793 bm::id64_t BM_ALIGN16 simd_buf0[2] BM_ALIGN16ATTR;
794 bm::id64_t BM_ALIGN16 simd_buf1[2] BM_ALIGN16ATTR;
795#endif
796
797 const __m128i* block_end =
798 ( __m128i*)((bm::word_t*)(block) + size);
799 __m128i m1COshft, m2COshft;
800
801 unsigned w0 = *((bm::word_t*)(block));
802 unsigned gap_count = 1;
803 unsigned bit_count = 0;
804
805 unsigned co2, co1 = 0;
806 for (;block < block_end; block += 2, xor_block += 2)
807 {
808 __m128i m1A = _mm_load_si128(block);
809 __m128i m2A = _mm_load_si128(block+1);
810 __m128i m1B = _mm_load_si128(xor_block);
811 __m128i m2B = _mm_load_si128(xor_block+1);
812
813 m1A = _mm_xor_si128(m1A, m1B);
814 m2A = _mm_xor_si128(m2A, m2B);
815
816 {
817#ifdef BM64_SSE4
818 _mm_store_si128 ((__m128i*)simd_buf0, m1A);
819 _mm_store_si128 ((__m128i*)simd_buf1, m2A);
820 bit_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
821 bit_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));
822#else
823 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
824 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
825 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
826 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
827 bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
828 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
829
830 m0 = _mm_extract_epi32(m2A, 0);
831 m1 = _mm_extract_epi32(m2A, 1);
832 m2 = _mm_extract_epi32(m2A, 2);
833 m3 = _mm_extract_epi32(m2A, 3);
834 bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
835 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
836#endif
837 }
838
839 __m128i m1CO = _mm_srli_epi32(m1A, 31);
840 __m128i m2CO = _mm_srli_epi32(m2A, 31);
841
842 co2 = _mm_extract_epi32(m1CO, 3);
843
844 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
845 __m128i m2As = _mm_slli_epi32(m2A, 1);
846
847 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
848 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
849
850 co1 = co2;
851
852 co2 = _mm_extract_epi32(m2CO, 3);
853
854 m2COshft = _mm_slli_si128 (m2CO, 4);
855 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
856
857 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
858 m2As = _mm_or_si128(m2As, m2COshft);
859
860 co1 = co2;
861
862 // we now have two shifted SSE4 regs with carry-over
863 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
864 m2A = _mm_xor_si128(m2A, m2As);
865
866#ifdef BM64_SSE4
867 _mm_store_si128 ((__m128i*)simd_buf0, m1A);
868 _mm_store_si128 ((__m128i*)simd_buf1, m2A);
869 gap_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
870 gap_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));
871#else
872 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
873 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
874 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
875 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
876 gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
877 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
878
879 m0 = _mm_extract_epi32(m2A, 0);
880 m1 = _mm_extract_epi32(m2A, 1);
881 m2 = _mm_extract_epi32(m2A, 2);
882 m3 = _mm_extract_epi32(m2A, 3);
883 gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
884 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
885#endif
886
887 }
888 gap_count -= (w0 & 1u); // correct initial carry-in error
889 if (!gap_count)
890 ++gap_count; // must be >0
891 *gc = gap_count;
892 *bc = bit_count;
893}
894
895
896
897#ifdef BM64_SSE4
898
899/*!
900 SSE4.2 calculate number of bit changes from 0 to 1
901 @ingroup SSE4
902*/
903inline
904void sse42_bit_block_calc_change_bc(const __m128i* BMRESTRICT block,
905 unsigned* gc, unsigned* bc) BMNOEXCEPT
906{
907 const __m128i* block_end =
908 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
909 __m128i m1COshft, m2COshft;
910
911 unsigned w0 = *((bm::word_t*)(block));
912 unsigned bit_count = 0;
913 unsigned gap_count = 1;
914
915 unsigned co2, co1 = 0;
916 for (;block < block_end; block += 2)
917 {
918 __m128i m1A = _mm_load_si128(block);
919 __m128i m2A = _mm_load_si128(block+1);
920 {
921 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
922 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
923 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
924 m0 = _mm_extract_epi64(m2A, 0);
925 m1 = _mm_extract_epi64(m2A, 1);
926 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
927 }
928
929 __m128i m1CO = _mm_srli_epi32(m1A, 31);
930 __m128i m2CO = _mm_srli_epi32(m2A, 31);
931
932 co2 = _mm_extract_epi32(m1CO, 3);
933
934 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
935 __m128i m2As = _mm_slli_epi32(m2A, 1);
936
937 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
938 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
939
940 co1 = co2;
941
942 co2 = _mm_extract_epi32(m2CO, 3);
943
944 m2COshft = _mm_slli_si128 (m2CO, 4);
945 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
946
947 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
948 m2As = _mm_or_si128(m2As, m2COshft);
949
950 co1 = co2;
951
952 // we now have two shifted SSE4 regs with carry-over
953 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
954 m2A = _mm_xor_si128(m2A, m2As);
955 {
956 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
957 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
958 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
959 }
960
961 bm::id64_t m0 = _mm_extract_epi64(m2A, 0);
962 bm::id64_t m1 = _mm_extract_epi64(m2A, 1);
963 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
964
965 }
966 gap_count -= (w0 & 1u); // correct initial carry-in error
967 *gc = gap_count;
968 *bc = bit_count;
969}
970
971#endif
972
973
974/*!
975 \brief Find first bit which is different between two bit-blocks
976 @ingroup SSE4
977*/
978inline
979bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1,
980 const __m128i* BMRESTRICT block2,
981 unsigned* pos) BMNOEXCEPT
982{
983 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
984
985 const __m128i* block1_end =
986 (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
987 const __m128i maskZ = _mm_setzero_si128();
988 __m128i mA, mB;
989 unsigned simd_lane = 0;
990 do
991 {
992 mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
993 mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
994 __m128i mOR = _mm_or_si128(mA, mB);
995 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
996 {
997 if (!_mm_test_all_zeros(mA, mA))
998 {
999 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1000 mask = ~mask; // invert to find (w != 0)
1001 BM_ASSERT(mask);
1002 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1003 _mm_store_si128 ((__m128i*)simd_buf, mA);
1004 unsigned widx = bsf >> 2; // (bsf / 4);
1005 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
1006 bsf = BM_BSF32(w); // find first bit != 0
1007 *pos = (simd_lane * 128) + (widx * 32) + bsf;
1008 return true;
1009 }
1010 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
1011 mask = ~mask; // invert to find (w != 0)
1012 BM_ASSERT(mask);
1013 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1014 _mm_store_si128 ((__m128i*)simd_buf, mB);
1015 unsigned widx = bsf >> 2; // (bsf / 4);
1016 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
1017 bsf = BM_BSF32(w); // find first bit != 0
1018 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
1019 return true;
1020 }
1021
1022 simd_lane+=2;
1023 block1+=2; block2+=2;
1024
1025 } while (block1 < block1_end);
1026 return false;
1027}
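/*
    How the position is decoded above (sketch): _mm_cmpeq_epi32 + movemask
    yield 4 mask bits per 32-bit lane, so BSF(inverted mask) >> 2 is the
    first non-zero lane and a second BSF inside that word gives the bit
    offset. Scalar illustration for a single word pair:

    inline unsigned scalar_first_diff_bit(bm::word_t a, bm::word_t b) BMNOEXCEPT
    {
        BM_ASSERT(a != b);
        return unsigned(BM_BSF32(a ^ b)); // lowest differing bit index
    }
*/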
1028
1029
1030/*!
1031 \brief Find first non-zero bit
1032 @ingroup SSE4
1033*/
1034inline
1035bool sse42_bit_find_first(const __m128i* BMRESTRICT block,
1036 unsigned* pos) BMNOEXCEPT
1037{
1038 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
1039
1040 const __m128i* block_end =
1041 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1042 const __m128i maskZ = _mm_setzero_si128();
1043 __m128i mA, mB;
1044 unsigned simd_lane = 0;
1045 do
1046 {
1047 mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
1048 __m128i mOR = _mm_or_si128(mA, mB);
1049 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
1050 {
1051 if (!_mm_test_all_zeros(mA, mA))
1052 {
1053 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1054 mask = ~mask; // invert to find (w != 0)
1055 BM_ASSERT(mask);
1056 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1057 _mm_store_si128 ((__m128i*)simd_buf, mA);
1058 unsigned widx = bsf >> 2; // (bsf / 4);
1059 unsigned w = simd_buf[widx];
1060 bsf = BM_BSF32(w); // find first bit != 0
1061 *pos = (simd_lane * 128) + (widx * 32) + bsf;
1062 return true;
1063 }
1064 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
1065 mask = ~mask; // invert to find (w != 0)
1066 BM_ASSERT(mask);
1067 int bsf = BM_BSF32(mask); // find first !=0 (could use lzcnt())
1068 _mm_store_si128 ((__m128i*)simd_buf, mB);
1069 unsigned widx = bsf >> 2; // (bsf / 4);
1070 unsigned w = simd_buf[widx];
1071 bsf = BM_BSF32(w); // find first bit != 0
1072 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
1073 return true;
1074 }
1075
1076 simd_lane+=2;
1077 block+=2;
1078
1079 } while (block < block_end);
1080 return false;
1081}
1082
1083
1084
1085
1086#ifdef __GNUG__
1087// necessary measure to silence a false warning from GCC about negative pointer arithmetic
1088#pragma GCC diagnostic push
1089#pragma GCC diagnostic ignored "-Warray-bounds"
1090#endif
1091
1092/*!
1093 SSE4.2 check of one to two (variable length) 128-bit SSE lines
1094 for GAP search results (8 elements each)
1095 @ingroup SSE4
1096 \internal
1097*/
1098inline
1099unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
1100 const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT
1101{
1102 BM_ASSERT(size <= 16);
1103 BM_ASSERT(size);
1104
1105 const unsigned unroll_factor = 8;
1106 if (size < 4) // for very short vector use conventional scan
1107 {
1108 unsigned j;
1109 for (j = 0; j < size; ++j)
1110 {
1111 if (pbuf[j] >= pos)
1112 break;
1113 }
1114 return j;
1115 }
1116
1117 __m128i m1, mz, maskF, maskFL;
1118
1119 mz = _mm_setzero_si128();
1120 m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1121
1122 maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
1123 maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1124 int shiftL= (64 - (unroll_factor - size) * 16);
1125 maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1126
1127 m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1128 m1 = _mm_or_si128(m1, maskFL);
1129
1130 __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1131 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1132 __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
1133 int mi = _mm_movemask_epi8(c_mask); // collect flag bits
1134 if (unsigned bc = _mm_popcnt_u32(mi)) // gives us number of elements >= pos
1135 return unroll_factor - bc; // address of first one element (target)
1136 // inspect the next lane with a possible step back (to avoid reading past the block boundary)
1137 // GCC gives a false warning for "- unroll_factor" here
1138 const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1139 BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
1140
1141 m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1142 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1143 mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
1144 unsigned bc = _mm_popcnt_u32(mi);
1145
1146 return size - bc;
1147}
1148
1149/**
1150 Hybrid binary search, starts as binary, then switches to linear scan
1151
1152 \param buf - GAP buffer pointer.
1153 \param pos - index of the element.
1154 \param is_set - output. GAP value (0 or 1).
1155 \return GAP index.
1156
1157 @ingroup SSE4
1158*/
1159inline
1160unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf,
1161 unsigned pos, unsigned* BMRESTRICT is_set) BMNOEXCEPT
1162{
1163 unsigned start = 1;
1164 unsigned end = 1 + ((*buf) >> 3);
1165 unsigned dsize = end - start;
1166
1167 if (dsize < 17)
1168 {
1169 start = bm::sse4_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
1170 *is_set = ((*buf) & 1) ^ (start & 1);
1171 BM_ASSERT(buf[start+1] >= pos);
1172 BM_ASSERT(buf[start] < pos || (start==0));
1173
1174 return start+1;
1175 }
1176 unsigned arr_end = end;
1177 while (start != end)
1178 {
1179 unsigned curr = (start + end) >> 1;
1180 if (buf[curr] < pos)
1181 start = curr + 1;
1182 else
1183 end = curr;
1184
1185 unsigned size = end - start;
1186 if (size < 16)
1187 {
1188 size += (end != arr_end);
1189 unsigned idx =
1190 bm::sse4_gap_find(buf + start, (bm::gap_word_t)pos, size);
1191 start += idx;
1192
1193 BM_ASSERT(buf[start] >= pos);
1194 BM_ASSERT(buf[start - 1] < pos || (start == 1));
1195 break;
1196 }
1197 }
1198
1199 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1200 return start;
1201}
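/*
    Usage sketch for sse42_gap_bfind() (illustrative; assumes the standard GAP
    block layout where buf[0] keeps the start bit value in bit 0 and the
    length in the upper bits):

        unsigned is_set;
        unsigned gap_idx = bm::sse42_gap_bfind(gap_buf, bit_pos, &is_set);
        // gap_buf[gap_idx] is the first GAP element >= bit_pos,
        // is_set is the value (0 or 1) of bit bit_pos in the encoded block
*/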
1202
1203/**
1204 Hybrid binary search, starts as binary, then switches to scan
1205 @ingroup SSE4
1206*/
1207inline
1208unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) BMNOEXCEPT
1209{
1210 unsigned is_set;
1211 bm::sse42_gap_bfind(buf, pos, &is_set);
1212 return is_set;
1213}
1214
1215
1216
1217/**
1218 Experimental (test) function to do SIMD vector search (lower bound)
1219 in sorted, growing array
1220 @ingroup SSE4
1221
1222 \internal
1223*/
1224inline
1225int sse42_cmpge_u32(__m128i vect4, unsigned value) BMNOEXCEPT
1226{
1227 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1228 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1229 //
1230 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1231 __m128i mm_val = _mm_set1_epi32(value);
1232
1233 __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
1234 __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
1235
1236 __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
1237 __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
1238
1239 __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
1240 int mask = _mm_movemask_epi8(cmp_mask_ge);
1241 if (mask)
1242 {
1243 int bsf = BM_BSF32(mask);//_bit_scan_forward(mask);
1244 return bsf / 4;
1245 }
1246 return -1;
1247}
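/*
    Worked scalar form of the bias trick used above (sketch): SSE4.2 has no
    unsigned 32-bit compare, so both operands are shifted into signed range.

    inline bool scalar_cmpge_u32(unsigned a, unsigned b) BMNOEXCEPT
    {
        int sa = int(a ^ 0x80000000u); // == (signed)(a - 0x80000000) mod 2^32
        int sb = int(b ^ 0x80000000u);
        return (sa > sb) || (a == b);  // mirrors the GT | EQ mask logic above
    }
*/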
1248
1249
1250
1251/*!
1252 SSE4.2 index lookup to check what belongs to the same block (8 elements)
1253 \internal
1254*/
1255inline
1256unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
1257 unsigned nb, unsigned start) BMNOEXCEPT
1258{
1259 const unsigned unroll_factor = 8;
1260 const unsigned len = (size - start);
1261 const unsigned len_unr = len - (len % unroll_factor);
1262 unsigned k;
1263
1264 idx += start;
1265
1266 __m128i nbM = _mm_set1_epi32(nb);
1267
1268 for (k = 0; k < len_unr; k+=unroll_factor)
1269 {
1270 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1271 __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
1272 __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
1273 __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
1274
1275 if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
1276 !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
1277 break;
1278
1279 } // for k
1280 for (; k < len; ++k)
1281 {
1282 if (nb != unsigned(idx[k] >> bm::set_block_shift))
1283 break;
1284 }
1285 return start + k;
1286}
1287
1288/*!
1289 SSE4.2 bulk bit set
1290 \internal
1291*/
1292inline
1293void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
1294 const unsigned* BMRESTRICT idx,
1295 unsigned start, unsigned stop ) BMNOEXCEPT
1296{
1297 const unsigned unroll_factor = 4;
1298 const unsigned len = (stop - start);
1299 const unsigned len_unr = len - (len % unroll_factor);
1300
1301 idx += start;
1302
1303 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1304 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1305
1306 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1307 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1308
1309 unsigned k = 0;
1310 for (; k < len_unr; k+=unroll_factor)
1311 {
1312 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1313 __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
1314 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1315
1316
1317 nbitA = _mm_and_si128 (nbitA, sw_mask);
1318 _mm_store_si128 ((__m128i*)mshift_v, nbitA);
1319
1320 // check-compare if all 4 bits are in the very same word
1321 //
1322 __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
1323 __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
1324 if (_mm_test_all_ones(cmpA)) // check if all are in one word
1325 {
1326 unsigned nword = _mm_extract_epi32(nwordA, 0);
1327 block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
1328 |(1u << mshift_v[2]) | (1u << mshift_v[3]);
1329 }
1330 else // bits are in different words, use scalar scatter
1331 {
1332 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1333
1334 block[mword_v[0]] |= (1u << mshift_v[0]);
1335 block[mword_v[1]] |= (1u << mshift_v[1]);
1336 block[mword_v[2]] |= (1u << mshift_v[2]);
1337 block[mword_v[3]] |= (1u << mshift_v[3]);
1338 }
1339
1340 } // for k
1341
1342 for (; k < len; ++k)
1343 {
1344 unsigned n = idx[k];
1345 unsigned nbit = unsigned(n & bm::set_block_mask);
1346 unsigned nword = nbit >> bm::set_word_shift;
1347 nbit &= bm::set_word_mask;
1348 block[nword] |= (1u << nbit);
1349 } // for k
1350}
1351
1352
1353/*!
1354 SSE4.2 bit block gather-scatter
1355
1356 @param arr - destination array to set bits
1357 @param blk - source bit-block
1358 @param idx - gather index array
1359 @param size - gather array size
1360 @param start - gather start index
1361 @param bit_idx - bit to set in the target array
1362
1363 \internal
1364
1365 C algorithm:
1366
1367 for (unsigned k = start; k < size; ++k)
1368 {
1369 nbit = unsigned(idx[k] & bm::set_block_mask);
1370 nword = unsigned(nbit >> bm::set_word_shift);
1371 mask0 = 1u << (nbit & bm::set_word_mask);
1372 arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
1373 }
1374
1375*/
1376inline
1377void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
1378 const unsigned* BMRESTRICT blk,
1379 const unsigned* BMRESTRICT idx,
1380 unsigned size,
1381 unsigned start,
1382 unsigned bit_idx) BMNOEXCEPT
1383{
1384 const unsigned unroll_factor = 4;
1385 const unsigned len = (size - start);
1386 const unsigned len_unr = len - (len % unroll_factor);
1387
1388 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1389 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1390 __m128i maskFF = _mm_set1_epi32(~0u);
1391 __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1392
1393 __m128i mask_tmp, mask_0;
1394
1395 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1396 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1397
1398 unsigned k = 0;
1399 unsigned base = start + k;
1400 __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1401 __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1402 for (; k < len_unr; k+=unroll_factor)
1403 {
1404 __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1405 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1406 // (nbit & bm::set_word_mask)
1407 _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1408 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1409
1410 // mask0 = 1u << (nbit & bm::set_word_mask);
1411 //
1412#if 0
1413 // ifdefed an alternative SHIFT implementation using SSE and masks
1414 // (it is not faster than just doing scalar operations)
1415 {
1416 __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1417 __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1418 mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1419 mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1420 mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1421
1422 __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1423 _mm_slli_si128 (am_0, 8));
1424 mask_tmp = _mm_and_si128 (
1425 _mm_slli_epi32(mask1, mshift_v[3]),
1426 _mm_slli_si128 (am_0, 12)
1427 );
1428
1429 mask_0 = _mm_or_si128 (mask_0,
1430 _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1431 }
1432#endif
1433 mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1434
1435
1436 // gather for: blk[nword] (.. & mask0 )
1437 //
1438 mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1439 blk[mword_v[1]], blk[mword_v[0]]),
1440 mask_0);
1441
1442 // bool(blk[nword] ...)
1443 //maskFF = _mm_set1_epi32(~0u);
1444 mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1445 mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1446 mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1447
1448 mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1449
1450 _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1451 _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1452
1453 ++idx_ptr; ++target_ptr;
1454 _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1455 }
1456
1457 for (; k < len; ++k)
1458 {
1459 base = start + k;
1460 unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1461 arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1462 }
1463
1464}
1465
1466/*!
1467 @brief block shift left by 1
1468 @ingroup SSE4
1469*/
1470inline
1471bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1472{
1473 __m128i* block_end =
1474 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1475 __m128i mAcc = _mm_set1_epi32(0);
1476 __m128i mMask1 = _mm_set1_epi32(1);
1477
1478 unsigned co2;
1479 for (--block_end; block_end >= block; block_end -= 2)
1480 {
1481 __m128i m1A = _mm_load_si128(block_end);
1482 __m128i m2A = _mm_load_si128(block_end-1);
1483
1484 __m128i m1CO = _mm_and_si128(m1A, mMask1);
1485 __m128i m2CO = _mm_and_si128(m2A, mMask1);
1486
1487 co2 = _mm_extract_epi32(m1CO, 0);
1488
1489 m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1490 m2A = _mm_srli_epi32(m2A, 1);
1491
1492 __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1493 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1494 m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1495 m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1496 m1COshft = _mm_slli_epi32(m1COshft, 31);
1497 m2COshft = _mm_slli_epi32(m2COshft, 31);
1498
1499 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1500 m2A = _mm_or_si128(m2A, m2COshft);
1501
1502 co1 = _mm_extract_epi32(m2CO, 0);
1503
1504 _mm_store_si128(block_end, m1A);
1505 _mm_store_si128(block_end-1, m2A);
1506
1507 mAcc = _mm_or_si128(mAcc, m1A);
1508 mAcc = _mm_or_si128(mAcc, m2A);
1509 } // for
1510
1511 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1512 return co1;
1513}
1514
1515
1516/*!
1517 @brief block shift right by 1
1518 @ingroup SSE4
1519*/
1520inline
1521bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1522{
1523 __m128i* block_end =
1524 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1525 __m128i m1COshft, m2COshft;
1526 __m128i mAcc = _mm_set1_epi32(0);
1527
1528 unsigned co2;
1529 for (;block < block_end; block += 2)
1530 {
1531 __m128i m1A = _mm_load_si128(block);
1532 __m128i m2A = _mm_load_si128(block+1);
1533
1534 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1535 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1536
1537 co2 = _mm_extract_epi32(m1CO, 3);
1538
1539 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1540 m2A = _mm_slli_epi32(m2A, 1);
1541
1542 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1543 m2COshft = _mm_slli_si128 (m2CO, 4);
1544 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1545 m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1546
1547 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1548 m2A = _mm_or_si128(m2A, m2COshft);
1549
1550 co1 = _mm_extract_epi32(m2CO, 3);
1551
1552 _mm_store_si128(block, m1A);
1553 _mm_store_si128(block+1, m2A);
1554
1555 mAcc = _mm_or_si128(mAcc, m1A);
1556 mAcc = _mm_or_si128(mAcc, m2A);
1557 }
1558 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1559 return co1;
1560}
1561
1562
1563
1564/*!
1565 @brief block shift right by 1 plus AND
1566
1567 @return carry over flag
1568 @ingroup SSE4
1569*/
1570inline
1571bool sse42_shift_r1_and(__m128i* block,
1572 bm::word_t co1,
1573 const __m128i* BMRESTRICT mask_block,
1574 bm::id64_t* digest) BMNOEXCEPT
1575{
1576 bm::word_t* wblock = (bm::word_t*) block;
1577 const bm::word_t* mblock = (const bm::word_t*) mask_block;
1578
1579 __m128i m1COshft, m2COshft;
1580 __m128i mAcc = _mm_set1_epi32(0);
1581 unsigned co2;
1582
1583 bm::id64_t d, wd;
1584 wd = d = *digest;
1585
1586 unsigned di = 0;
1587 if (!co1)
1588 {
1589 bm::id64_t t = d & -d;
1590#ifdef BM64_SSE4
1591 di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1592#else
1593 bm::id_t t32 = t & bm::id_max;
1594 if (t32 == 0) {
1595 di = 32;
1596 t32 = t >> 32;
1597 }
1598 di += unsigned(_mm_popcnt_u32(t32 - 1));
1599#endif
1600 }
1601
1602 for (; di < 64 ; ++di)
1603 {
1604 const unsigned d_base = di * bm::set_block_digest_wave_size;
1605 bm::id64_t dmask = (1ull << di);
1606 if (d & dmask) // digest stride NOT empty
1607 {
1608 block = (__m128i*) &wblock[d_base];
1609 mask_block = (__m128i*) &mblock[d_base];
1610 mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1611 for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1612 {
1613 __m128i m1A = _mm_load_si128(block);
1614 __m128i m2A = _mm_load_si128(block+1);
1615
1616 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1617 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1618
1619 co2 = _mm_extract_epi32(m1CO, 3);
1620
1621 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1622 m2A = _mm_slli_epi32(m2A, 1);
1623
1624 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1625 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1626
1627 co1 = co2;
1628
1629 co2 = _mm_extract_epi32(m2CO, 3);
1630
1631 m2COshft = _mm_slli_si128 (m2CO, 4);
1632 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1633
1634 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1635 m2A = _mm_or_si128(m2A, m2COshft);
1636
1637 m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1638 m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1639
1640 mAcc = _mm_or_si128(mAcc, m1A);
1641 mAcc = _mm_or_si128(mAcc, m2A);
1642
1643 _mm_store_si128(block, m1A);
1644 _mm_store_si128(block+1, m2A);
1645
1646 co1 = co2;
1647
1648 } // for i
1649
1650 if (_mm_testz_si128(mAcc, mAcc))
1651 d &= ~dmask; // clear digest bit
1652 wd &= wd - 1;
1653 }
1654 else
1655 {
1656 if (co1)
1657 {
1658 BM_ASSERT(co1 == 1);
1659 BM_ASSERT(wblock[d_base] == 0);
1660
1661 bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1662 d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1663 co1 = 0;
1664 }
1665 if (!wd) // digest is empty, no CO -> exit
1666 break;
1667 }
1668 } // for di
1669
1670 *digest = d;
1671 return co1;
1672}
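/*
    The start-stride computation above uses a classic identity (worked
    example): t = d & -d isolates the lowest set bit of the digest and
    _mm_popcnt_u64(t - 1) is that bit's index. For d = 0b101000:
    t = 0b001000, t - 1 = 0b000111, popcount = 3, so scanning starts
    at digest stride 3.
*/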
1673
1674/**
1675 Build partial XOR product of 2 bit-blocks using digest mask
1676
1677 @param target_block - target := block ^ xor_block
1678 @param block - arg1
1679 @param xor_block - arg2
1680 @param digest - mask for each block wave to XOR (1) or just copy (0)
1681
1682 @ingroup SSE4
1683 @internal
1684*/
1685inline
1686void sse42_bit_block_xor(bm::word_t* target_block,
1687 const bm::word_t* block, const bm::word_t* xor_block,
1688 bm::id64_t digest) BMNOEXCEPT
1689{
1690 for (unsigned i = 0; i < bm::block_waves; ++i)
1691 {
1692 const bm::id64_t mask = (1ull << i);
1693 unsigned off = (i * bm::set_block_digest_wave_size);
1694 const __m128i* sub_block = (__m128i*) (block + off);
1695 __m128i* t_sub_block = (__m128i*)(target_block + off);
1696
1697 if (digest & mask) // XOR filtered sub-block
1698 {
1699 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
1700 __m128i mA, mB, mC, mD;
1701 mA = _mm_xor_si128(_mm_load_si128(sub_block),
1702 _mm_load_si128(xor_sub_block));
1703 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
1704 _mm_load_si128(xor_sub_block+1));
1705 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
1706 _mm_load_si128(xor_sub_block+2));
1707 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
1708 _mm_load_si128(xor_sub_block+3));
1709
1710 _mm_store_si128(t_sub_block, mA);
1711 _mm_store_si128(t_sub_block+1, mB);
1712 _mm_store_si128(t_sub_block+2, mC);
1713 _mm_store_si128(t_sub_block+3, mD);
1714
1715 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
1716 _mm_load_si128(xor_sub_block+4));
1717 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
1718 _mm_load_si128(xor_sub_block+5));
1719 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
1720 _mm_load_si128(xor_sub_block+6));
1721 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
1722 _mm_load_si128(xor_sub_block+7));
1723
1724 _mm_store_si128(t_sub_block+4, mA);
1725 _mm_store_si128(t_sub_block+5, mB);
1726 _mm_store_si128(t_sub_block+6, mC);
1727 _mm_store_si128(t_sub_block+7, mD);
1728
1729 }
1730 else // just copy source
1731 {
1732 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
1733 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
1734 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
1735 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
1736
1737 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
1738 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
1739 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
1740 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
1741 }
1742 } // for i
1743}
1744
1745/**
1746 Build partial XOR product of 2 bit-blocks using digest mask
1747
1748 @param target_block - target ^= xor_block
1749 @param xor_block - arg1
1750 @param digest - mask for each block wave to XOR (if 1)
1751
1752 @ingroup SSE4
1753 @internal
1754*/
1755inline
1756void sse42_bit_block_xor_2way(bm::word_t* target_block,
1757 const bm::word_t* xor_block,
1758 bm::id64_t digest) BMNOEXCEPT
1759{
1760 while (digest)
1761 {
1762 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
1763 unsigned wave = unsigned(_mm_popcnt_u64(t - 1));
1764 unsigned off = wave * bm::set_block_digest_wave_size;
1765
1766 const __m128i* sub_block = (const __m128i*) (xor_block + off);
1767 __m128i* t_sub_block = (__m128i*)(target_block + off);
1768
1769 __m128i mA, mB, mC, mD;
1770 mA = _mm_xor_si128(_mm_load_si128(sub_block),
1771 _mm_load_si128(t_sub_block));
1772 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
1773 _mm_load_si128(t_sub_block+1));
1774 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
1775 _mm_load_si128(t_sub_block+2));
1776 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
1777 _mm_load_si128(t_sub_block+3));
1778
1779 _mm_store_si128(t_sub_block, mA);
1780 _mm_store_si128(t_sub_block+1, mB);
1781 _mm_store_si128(t_sub_block+2, mC);
1782 _mm_store_si128(t_sub_block+3, mD);
1783
1784 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
1785 _mm_load_si128(t_sub_block+4));
1786 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
1787 _mm_load_si128(t_sub_block+5));
1788 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
1789 _mm_load_si128(t_sub_block+6));
1790 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
1791 _mm_load_si128(t_sub_block+7));
1792
1793 _mm_store_si128(t_sub_block+4, mA);
1794 _mm_store_si128(t_sub_block+5, mB);
1795 _mm_store_si128(t_sub_block+6, mC);
1796 _mm_store_si128(t_sub_block+7, mD);
1797
1798 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
1799 } // while
1800}
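/*
    Digest iteration pattern used above (sketch): bm::bmi_blsi_u64(d) is
    (d & -d) and bm::bmi_bslr_u64(d) is (d & (d - 1)), so the while-loop
    visits every set digest bit once, lowest bit first:

        for (bm::id64_t d = digest; d; d = d & (d - 1))
        {
            unsigned wave = unsigned(_mm_popcnt_u64((d & -d) - 1));
            unsigned off = wave * bm::set_block_digest_wave_size;
            // process the 32-word wave at offset off ...
        }
*/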
1801
1802
1803
1804#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1805 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1806
1807#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1808 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1809
1810#define VECT_BITCOUNT(first, last) \
1811 sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1812
1813#define VECT_BITCOUNT_AND(first, last, mask) \
1814 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1815
1816#define VECT_BITCOUNT_OR(first, last, mask) \
1817 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1818
1819#define VECT_BITCOUNT_XOR(first, last, mask) \
1820 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1821
1822#define VECT_BITCOUNT_SUB(first, last, mask) \
1823 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1824
1825#define VECT_INVERT_BLOCK(first) \
1826 sse2_invert_block((__m128i*)first);
1827
1828#define VECT_AND_BLOCK(dst, src) \
1829 sse4_and_block((__m128i*) dst, (__m128i*) (src))
1830
1831#define VECT_AND_DIGEST(dst, src) \
1832 sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1833
1834#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
1835 sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1836
1837#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1838 sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1839
1840#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1841 sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1842
1843#define VECT_OR_BLOCK(dst, src) \
1844 sse2_or_block((__m128i*) dst, (__m128i*) (src))
1845
1846#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1847 sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1848
1849#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1850 sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1851
1852#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1853 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1854
1855#define VECT_SUB_BLOCK(dst, src) \
1856 sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1857
1858#define VECT_SUB_DIGEST(dst, src) \
1859 sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1860
1861#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1862 sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1863
1864#define VECT_XOR_BLOCK(dst, src) \
1865 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1866
1867#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1868 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1869
1870#define VECT_COPY_BLOCK(dst, src) \
1871 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1872
1873#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
1874 sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
1875
1876#define VECT_STREAM_BLOCK(dst, src) \
1877 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1878
1879#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
1880 sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
1881
1882#define VECT_SET_BLOCK(dst, value) \
1883 sse2_set_block((__m128i*) dst, value)
1884
1885#define VECT_IS_ZERO_BLOCK(dst) \
1886 sse4_is_all_zero((__m128i*) dst)
1887
1888#define VECT_IS_ONE_BLOCK(dst) \
1889 sse4_is_all_one((__m128i*) dst)
1890
1891#define VECT_IS_DIGEST_ZERO(start) \
1892 sse4_is_digest_zero((__m128i*)start)
1893
1894#define VECT_BLOCK_SET_DIGEST(dst, val) \
1895 sse4_block_set_digest((__m128i*)dst, val)
1896
1897#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1898 sse2_lower_bound_scan_u32(arr, target, from, to)
1899
1900#define VECT_SHIFT_L1(b, acc, co) \
1901 sse42_shift_l1((__m128i*)b, acc, co)
1902
1903#define VECT_SHIFT_R1(b, acc, co) \
1904 sse42_shift_r1((__m128i*)b, acc, co)
1905
1906#define VECT_SHIFT_R1_AND(b, co, m, digest) \
1907 sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1908
1909#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1910 sse42_idx_arr_block_lookup(idx, size, nb, start)
1911
1912#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1913 sse42_set_block_bits(block, idx, start, stop)
1914
1915#define VECT_BLOCK_CHANGE(block, size) \
1916 sse42_bit_block_calc_change((__m128i*)block, size)
1917
1918#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
1919 sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)
1920
1921#ifdef BM64_SSE4
1922#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
1923 sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
1924#endif
1925
1926#define VECT_BIT_FIND_FIRST(src, pos) \
1927 sse42_bit_find_first((__m128i*) src, pos)
1928
1929#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1930 sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1931
1932#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1933 sse42_bit_block_xor(t, src, src_xor, d)
1934
1935#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
1936 sse42_bit_block_xor_2way(t, src_xor, d)
1937
1938
1939#define VECT_GAP_BFIND(buf, pos, is_set) \
1940 sse42_gap_bfind(buf, pos, is_set)
1941
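/*
    Usage sketch (illustrative, not part of the dispatch table above): when
    SSE4.2 optimization is enabled (e.g. via the BMSSE42OPT define), the
    generic block algorithms pick up these VECT_* overrides instead of the
    scalar defaults. For example, on a 16-byte aligned 2048-word bit-block:

        BM_ALIGN16 bm::word_t blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
        // ... fill blk ...
        bm::id_t cnt  = VECT_BITCOUNT(blk, blk + bm::set_block_size);
        bool all_zero = VECT_IS_ZERO_BLOCK(blk);
*/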
1942#ifdef __GNUG__
1943#pragma GCC diagnostic pop
1944#endif
1945
1946
1947// undefine local defines to avoid preprocessor name-space pollution
1948//
1949#undef BM_BSF32
1950
1951#ifdef _MSC_VER
1952#pragma warning( pop )
1953#endif
1954
1955} // namespace bm
1956
1957
1958
1959
1960#endif
Definitions (internal)
#define BM_ALIGN16
Definition: bmdef.h:287
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BM_ALIGN32
Definition: bmdef.h:292
#define BM_ALIGN16ATTR
Definition: bmdef.h:288
#define BMFORCEINLINE
Definition: bmdef.h:213
#define BM_ASSERT
Definition: bmdef.h:139
#define BM_ALIGN32ATTR
Definition: bmdef.h:293
#define BM_BSF32
Definition: bmsse4.h:66
Compute functions for SSE SIMD instruction set (internal)
Bit manipulation primitives (internal)
BMFORCEINLINE bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1) BMNOEXCEPT
check if a wave of 2 pointers is the same (NULL or FULL)
Definition: bmsse4.h:695
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift left by 1
Definition: bmsse4.h:1471
BMFORCEINLINE bool sse42_test_all_zero_wave(const void *ptr) BMNOEXCEPT
check if wave of pointers is all NULL
Definition: bmsse4.h:671
unsigned sse42_bit_block_calc_change(const __m128i *BMRESTRICT block, unsigned size) BMNOEXCEPT
Definition: bmsse4.h:709
bool sse42_bit_find_first_diff(const __m128i *BMRESTRICT block1, const __m128i *BMRESTRICT block2, unsigned *pos) BMNOEXCEPT
Find first bit which is different between two bit-blocks.
Definition: bmsse4.h:979
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift right by 1
Definition: bmsse4.h:1521
void sse42_bit_block_calc_xor_change(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gc, unsigned *BMRESTRICT bc) BMNOEXCEPT
Definition: bmsse4.h:786
int sse42_cmpge_u32(__m128i vect4, unsigned value) BMNOEXCEPT
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
Definition: bmsse4.h:1225
BMFORCEINLINE bool sse4_is_digest_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if digest stride is all zero bits
Definition: bmsse4.h:220
BMFORCEINLINE bool sse4_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND block digest stride dst &= *src.
Definition: bmsse4.h:304
bool sse4_is_all_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all zero bits
Definition: bmsse4.h:195
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end) BMNOEXCEPT
Definition: bmsse4.h:93
BMFORCEINLINE bool sse4_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND block digest stride dst = *src1 & *src2.
Definition: bmsse4.h:352
void sse42_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmsse4.h:1686
bool sse4_and_or_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND-OR block digest stride dst |= *src1 & *src2.
Definition: bmsse4.h:401
unsigned sse4_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND blocks: dst &= *src.
Definition: bmsse4.h:257
BMFORCEINLINE bool sse4_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
Definition: bmsse4.h:587
BMFORCEINLINE bool sse4_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SUB (AND NOT) block digest stride dst &= ~*src.
Definition: bmsse4.h:538
bool sse4_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4) BMNOEXCEPT
AND block digest stride.
Definition: bmsse4.h:455
unsigned sse42_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos) BMNOEXCEPT
Hybrid binary search: starts as binary, then switches to a linear scan.
Definition: bmsse4.h:1208
unsigned sse42_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set) BMNOEXCEPT
Hybrid binary search: starts as binary, then switches to a linear scan (see the sketch at the end of this list).
Definition: bmsse4.h:1160
bool sse42_bit_find_first(const __m128i *BMRESTRICT block, unsigned *pos) BMNOEXCEPT
Find first non-zero bit.
Definition: bmsse4.h:1035
BMFORCEINLINE void sse4_block_set_digest(__m128i *dst, unsigned value) BMNOEXCEPT
set digest stride to 0xFF.. or 0x0 value
Definition: bmsse4.h:239
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *BMRESTRICT mask_block, bm::id64_t *digest) BMNOEXCEPT
block shift right by 1 plus AND
Definition: bmsse4.h:1571
bool sse4_is_all_one(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all ONE bits
Definition: bmsse4.h:635
unsigned sse4_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT
Definition: bmsse4.h:1099
BMFORCEINLINE bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1) BMNOEXCEPT
check if 2 waves of pointers are all NULL
Definition: bmsse4.h:682
void sse42_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmsse4.h:1756
BMFORCEINLINE bool sse42_test_all_one_wave(const void *ptr) BMNOEXCEPT
check if SSE wave is all 0xFFFF...FFF
Definition: bmsse4.h:660
void sse42_bit_block_calc_change_bc(const __m128i *BMRESTRICT block, unsigned *gc, unsigned *bc) BMNOEXCEPT
Definition: bmsse4.h:904
Definition: bm.h:78
const unsigned set_block_digest_wave_size
Definition: bmconst.h:67
const unsigned id_max
Definition: bmconst.h:109
unsigned int word_t
Definition: bmconst.h:39
const unsigned set_block_mask
Definition: bmconst.h:57
BMFORCEINLINE unsigned op_or(unsigned a, unsigned b) BMNOEXCEPT
Definition: bmsse4.h:136
void sse4_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx) BMNOEXCEPT
Definition: bmsse4.h:1377
bm::id_t sse4_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func) BMNOEXCEPT
Definition: bmsse4.h:152
BMFORCEINLINE unsigned op_and(unsigned a, unsigned b) BMNOEXCEPT
Definition: bmsse4.h:145
const unsigned set_word_shift
Definition: bmconst.h:72
void sse42_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop) BMNOEXCEPT
Definition: bmsse4.h:1293
const unsigned set_block_size
Definition: bmconst.h:55
unsigned long long int id64_t
Definition: bmconst.h:35
const unsigned block_waves
Definition: bmconst.h:66
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start) BMNOEXCEPT
Definition: bmsse4.h:1256
unsigned int id_t
Definition: bmconst.h:38
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
Definition: bmutil.h:335
unsigned short gap_word_t
Definition: bmconst.h:78
const unsigned set_block_shift
Definition: bmconst.h:56
const unsigned set_word_mask
Definition: bmconst.h:73
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
Definition: bmutil.h:345
BMFORCEINLINE unsigned op_xor(unsigned a, unsigned b) BMNOEXCEPT
Definition: bmsse4.h:126
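A generic sketch of the hybrid search named in the sse42_gap_bfind entry above: a classic binary search that switches to a short linear scan once the search window is small, avoiding the last few hard-to-predict binary steps. The plain unsigned short array, the lower-bound semantics, and the cut-over threshold are illustrative assumptions, not the GAP buffer layout used by bmsse4.h.

    inline unsigned hybrid_lower_bound(const unsigned short* buf,
                                       unsigned short target,
                                       unsigned from, unsigned to)
    {
        const unsigned linear_cutoff = 16;  // assumed switch-over window size
        while (to - from > linear_cutoff)   // binary phase: halve [from, to)
        {
            unsigned mid = (from + to) >> 1;
            if (buf[mid] < target)
                from = mid + 1;
            else
                to = mid;
        }
        for (; from < to && buf[from] < target; ++from) // linear scan phase
        {}
        return from; // first index whose value is >= target, or 'to' if none
    }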