BitMagic-C++
bmavx2.h
1#ifndef BMAVX2__H__INCLUDED__
2#define BMAVX2__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21// some of the algorithms here are based on a modified libpopcnt library by Kim Walisch
22// https://github.com/kimwalisch/libpopcnt/
23//
24/*
25 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
26 * population count) in an array as quickly as possible using
27 * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
28 *
29 * Copyright (c) 2016 - 2017, Kim Walisch
30 * Copyright (c) 2016 - 2017, Wojciech Muła
31 *
32 * All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions are met:
36 *
37 * 1. Redistributions of source code must retain the above copyright notice, this
38 * list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright notice,
40 * this list of conditions and the following disclaimer in the documentation
41 * and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
47 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
48 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
49 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55
56/** @defgroup AVX2 AVX2 functions
57 Processor specific optimizations for AVX2 instructions (internals)
58 @ingroup bvector
59 @internal
60 */
61
62
63// Header implements processor specific intrinsics declarations for AVX2
64// instruction set
65//
66#include<emmintrin.h>
67#include<immintrin.h>
68
69#include "bmdef.h"
70#include "bmbmi2.h"
71#include "bmutil.h"
72
73namespace bm
74{
75
76// debugging utils
77#if 0
78inline
79void avx2_print256_u32(const char* prefix, const __m256i & value)
80{
81 const size_t n = sizeof(__m256i) / sizeof(unsigned);
82 unsigned buffer[n];
83 _mm256_storeu_si256((__m256i*)buffer, value);
84 std::cout << prefix << " [ ";
85 for (int i = n-1; 1; --i)
86 {
87 std::cout << std::hex << buffer[i] << " ";
88 if (i == 0)
89 break;
90 }
91 std::cout << "]" << std::endl;
92}
93
94inline
95void avx2_print256_u16(const char* prefix, const __m256i & value)
96{
97 const size_t n = sizeof(__m256i) / sizeof(unsigned short);
98 unsigned short buffer[n];
99 _mm256_storeu_si256((__m256i*)buffer, value);
100 std::cout << prefix << " [ ";
101 for (int i = n-1; 1; --i)
102 {
103 std::cout << buffer[i] << " ";
104 if (i == 0)
105 break;
106 }
107 std::cout << "]" << std::endl;
108}
109#endif
110
111#ifdef __GNUG__
112#pragma GCC diagnostic push
113#pragma GCC diagnostic ignored "-Wconversion"
114#endif
115
116
117#define BM_CSA256(h, l, a, b, c) \
118{ \
119 __m256i u = _mm256_xor_si256(a, b); \
120 h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
121 l = _mm256_xor_si256(u, c); \
122}
123
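// BM_CSA256 above is a bitwise carry-save adder (CSA): for every bit lane it
// adds the three inputs a, b, c and returns the pair (h, l) such that
// 2*h + l == a + b + c. A scalar sketch of the same idea (illustrative only,
// plain 64-bit words instead of __m256i; csa64 is a hypothetical helper):
//
//   inline void csa64(bm::id64_t& h, bm::id64_t& l,
//                     bm::id64_t a, bm::id64_t b, bm::id64_t c)
//   {
//       bm::id64_t u = a ^ b;
//       h = (a & b) | (u & c); // carry bits (weight 2)
//       l = u ^ c;             // sum bits (weight 1)
//   }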
124#define BM_AVX2_BIT_COUNT(ret, v) \
125{ \
126 __m256i lo = _mm256_and_si256(v, low_mask); \
127 __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
128 __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
129 __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
130 ret = _mm256_sad_epu8(cnt1, cnt2); \
131}
132
133#define BM_AVX2_DECL_LOOKUP1 \
134 __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
135 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
136#define BM_AVX2_DECL_LOOKUP2 \
137__m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
138 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
139
140#define BM_AVX2_POPCNT_PROLOG \
141 BM_AVX2_DECL_LOOKUP1 \
142 BM_AVX2_DECL_LOOKUP2 \
143 __m256i low_mask = _mm256_set1_epi8(0x0f); \
144 __m256i bc;
145
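// BM_AVX2_BIT_COUNT uses the nibble-lookup byte popcount from the paper cited
// below: lookup1 returns 4 + popcount(low nibble), lookup2 returns
// 4 - popcount(high nibble), so _mm256_sad_epu8 (sum of absolute byte
// differences) accumulates |(4 + popcount(lo)) - (4 - popcount(hi))| =
// popcount(byte) into four 64-bit lanes. Worked example (illustrative): for
// byte 0xB5, lo = 0x5 (2 bits), hi = 0xB (3 bits); lookup1[0x5] = 6,
// lookup2[0xB] = 1, |6 - 1| = 5 = popcount(0xB5).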
146/*!
147 @brief AVX2 Harley-Seal popcount
148 The algorithm is based on the paper "Faster Population Counts
149 using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
150 Wojciech Mula (23 Nov 2016).
151 @see https://arxiv.org/abs/1611.07612
152
153 @ingroup AVX2
154*/
155inline
156bm::id_t avx2_bit_count(const __m256i* BMRESTRICT block,
157 const __m256i* BMRESTRICT block_end)
158{
159 __m256i cnt = _mm256_setzero_si256();
160 __m256i ones = _mm256_setzero_si256();
161 __m256i twos = _mm256_setzero_si256();
162 __m256i fours = _mm256_setzero_si256();
163 __m256i eights = _mm256_setzero_si256();
164 __m256i sixteens = _mm256_setzero_si256();
165 __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
166 __m256i b, c;
167
167
168 BM_AVX2_POPCNT_PROLOG
169 bm::id64_t* cnt64;
170
171 do
172 {
173 b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
174 BM_CSA256(twosA, ones, ones, b, c);
175
176 b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
177 BM_CSA256(twosB, ones, ones, b, c);
178 BM_CSA256(foursA, twos, twos, twosA, twosB);
179
180 b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
181 BM_CSA256(twosA, ones, ones, b, c);
182
183 b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
184 BM_CSA256(twosB, ones, ones, b, c);
185 BM_CSA256(foursB, twos, twos, twosA, twosB);
186 BM_CSA256(eightsA, fours, fours, foursA, foursB);
187
188 b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
189 BM_CSA256(twosA, ones, ones, b, c);
190
191 b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
192 BM_CSA256(twosB, ones, ones, b, c);
193 BM_CSA256(foursA, twos, twos, twosA, twosB);
194
195 b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
196 BM_CSA256(twosA, ones, ones, b, c);
197
198 b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
199 BM_CSA256(twosB, ones, ones, b, c);
200 BM_CSA256(foursB, twos, twos, twosA, twosB);
201 BM_CSA256(eightsB, fours, fours, foursA, foursB);
202 BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
203
204 BM_AVX2_BIT_COUNT(bc, sixteens);
205 cnt = _mm256_add_epi64(cnt, bc);
206
207 block += 16;
208 } while (block < block_end);
209
210 cnt = _mm256_slli_epi64(cnt, 4);
211 BM_AVX2_BIT_COUNT(bc, eights)
212 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
213 BM_AVX2_BIT_COUNT(bc, fours);
214 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
215 BM_AVX2_BIT_COUNT(bc, twos);
216 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
217 BM_AVX2_BIT_COUNT(bc, ones);
218 cnt = _mm256_add_epi64(cnt, bc);
219
220 cnt64 = (bm::id64_t*) &cnt;
221
222 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
223}
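// Usage sketch (illustrative): count all bits of one aligned bit-block.
// Assumes blk is a hypothetical pointer to a vector-aligned block of
// bm::set_block_size words.
//
//   const bm::word_t* blk = get_block(); // hypothetical accessor
//   bm::id_t cnt = bm::avx2_bit_count(
//                       (const __m256i*)blk,
//                       (const __m256i*)(blk + bm::set_block_size));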
224
225/*!
226 @brief Calculate population count based on digest
227
228 @return popcnt
229 @ingroup AVX2
230*/
231inline
232bm::id_t avx2_bit_block_count(const bm::word_t* const block,
233 bm::id64_t digest)
234{
235 bm::id_t count = 0;
236 bm::id64_t* cnt64;
237 BM_AVX2_POPCNT_PROLOG
238 __m256i cnt = _mm256_setzero_si256();
239 while (digest)
240 {
241 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
242
243 unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
244 unsigned off = wave * bm::set_block_digest_wave_size;
245
246 const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];
247
248 __m256i m1A, m1B, m1C, m1D;
249 m1A = _mm256_load_si256(wave_src);
250 m1B = _mm256_load_si256(wave_src+1);
251 if (!_mm256_testz_si256(m1A, m1A))
252 {
253 BM_AVX2_BIT_COUNT(bc, m1A)
254 cnt = _mm256_add_epi64(cnt, bc);
255 }
256 if (!_mm256_testz_si256(m1B, m1B))
257 {
258 BM_AVX2_BIT_COUNT(bc, m1B)
259 cnt = _mm256_add_epi64(cnt, bc);
260 }
261
262 m1C = _mm256_load_si256(wave_src+2);
263 m1D = _mm256_load_si256(wave_src+3);
264 if (!_mm256_testz_si256(m1C, m1C))
265 {
266 BM_AVX2_BIT_COUNT(bc, m1C)
267 cnt = _mm256_add_epi64(cnt, bc);
268 }
269 if (!_mm256_testz_si256(m1D, m1D))
270 {
271 BM_AVX2_BIT_COUNT(bc, m1D)
272 cnt = _mm256_add_epi64(cnt, bc);
273 }
274
275 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
276 } // while
277 cnt64 = (bm::id64_t*)&cnt;
278 count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
279 return count;
280
281}
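// Digest decoding used above, shown in scalar form (illustrative): each set
// bit i of the 64-bit digest marks a non-empty "wave" of
// bm::set_block_digest_wave_size words starting at word offset
// i * bm::set_block_digest_wave_size.
//
//   while (digest)
//   {
//       bm::id64_t t = digest & (0 - digest);            // isolate lowest set bit
//       unsigned wave = (unsigned)_mm_popcnt_u64(t - 1); // index of that bit
//       unsigned off = wave * bm::set_block_digest_wave_size;
//       // ... process block[off .. off + wave_size) ...
//       digest &= digest - 1;                            // clear lowest set bit
//   }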
282
283
284
285/*!
286 @brief AND bit count for two aligned bit-blocks
287 @ingroup AVX2
288*/
289inline
290bm::id_t avx2_bit_count_and(const __m256i* BMRESTRICT block,
291 const __m256i* BMRESTRICT block_end,
292 const __m256i* BMRESTRICT mask_block)
293{
294 bm::id64_t* cnt64;
295 BM_AVX2_POPCNT_PROLOG
296 __m256i cnt = _mm256_setzero_si256();
297 __m256i ymm0, ymm1;
298
299
300 do
301 {
302 ymm0 = _mm256_load_si256(block);
303 ymm1 = _mm256_load_si256(mask_block);
304 ymm0 = _mm256_and_si256(ymm0, ymm1);
305 ++block; ++mask_block;
306 BM_AVX2_BIT_COUNT(bc, ymm0)
307 cnt = _mm256_add_epi64(cnt, bc);
308
309 ymm0 = _mm256_load_si256(block);
310 ymm1 = _mm256_load_si256(mask_block);
311 ymm0 = _mm256_and_si256(ymm0, ymm1);
312 ++block; ++mask_block;
313 BM_AVX2_BIT_COUNT(bc, ymm0)
314 cnt = _mm256_add_epi64(cnt, bc);
315
316 ymm0 = _mm256_load_si256(block);
317 ymm1 = _mm256_load_si256(mask_block);
318 ymm0 = _mm256_and_si256(ymm0, ymm1);
319 ++block; ++mask_block;
320 BM_AVX2_BIT_COUNT(bc, ymm0)
321 cnt = _mm256_add_epi64(cnt, bc);
322
323 ymm0 = _mm256_load_si256(block);
324 ymm1 = _mm256_load_si256(mask_block);
325 ymm0 = _mm256_and_si256(ymm0, ymm1);
326 ++block; ++mask_block;
327 BM_AVX2_BIT_COUNT(bc, ymm0)
328 cnt = _mm256_add_epi64(cnt, bc);
329
330 } while (block < block_end);
331
332 cnt64 = (bm::id64_t*)&cnt;
333 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
334}
335
336inline
337bm::id_t avx2_bit_count_or(const __m256i* BMRESTRICT block,
338 const __m256i* BMRESTRICT block_end,
339 const __m256i* BMRESTRICT mask_block)
340{
341 bm::id64_t* cnt64;
342 BM_AVX2_POPCNT_PROLOG
343 __m256i cnt = _mm256_setzero_si256();
344 do
345 {
346 __m256i tmp0 = _mm256_load_si256(block);
347 __m256i tmp1 = _mm256_load_si256(mask_block);
348
349 tmp0 = _mm256_or_si256(tmp0, tmp1);
350
351 BM_AVX2_BIT_COUNT(bc, tmp0)
352 cnt = _mm256_add_epi64(cnt, bc);
353
354 ++block; ++mask_block;
355
356 } while (block < block_end);
357
358 cnt64 = (bm::id64_t*)&cnt;
359 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
360}
361
362
363/*!
364 @brief XOR bit count for two aligned bit-blocks
365 @ingroup AVX2
366*/
367inline
368bm::id_t avx2_bit_count_xor(const __m256i* BMRESTRICT block,
369 const __m256i* BMRESTRICT block_end,
370 const __m256i* BMRESTRICT mask_block)
371{
372 bm::id64_t* cnt64;
373 BM_AVX2_POPCNT_PROLOG
374 __m256i cnt = _mm256_setzero_si256();
375 __m256i mA, mB, mC, mD;
376 do
377 {
378 mA = _mm256_xor_si256(_mm256_load_si256(block+0),
379 _mm256_load_si256(mask_block+0));
380 BM_AVX2_BIT_COUNT(bc, mA)
381 cnt = _mm256_add_epi64(cnt, bc);
382
383 mB = _mm256_xor_si256(_mm256_load_si256(block+1),
384 _mm256_load_si256(mask_block+1));
385 BM_AVX2_BIT_COUNT(bc, mB);
386 cnt = _mm256_add_epi64(cnt, bc);
387
388 mC = _mm256_xor_si256(_mm256_load_si256(block+2),
389 _mm256_load_si256(mask_block+2));
390 BM_AVX2_BIT_COUNT(bc, mC);
391 cnt = _mm256_add_epi64(cnt, bc);
392
393 mD = _mm256_xor_si256(_mm256_load_si256(block+3),
394 _mm256_load_si256(mask_block+3));
395 BM_AVX2_BIT_COUNT(bc, mD);
396 cnt = _mm256_add_epi64(cnt, bc);
397
398 block += 4; mask_block += 4;
399
400 } while (block < block_end);
401
402 cnt64 = (bm::id64_t*)&cnt;
403 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
404}
405
406
407
408/*!
409 @brief AND NOT bit count for two aligned bit-blocks
410 @ingroup AVX2
411*/
412inline
413bm::id_t avx2_bit_count_sub(const __m256i* BMRESTRICT block,
414 const __m256i* BMRESTRICT block_end,
415 const __m256i* BMRESTRICT mask_block)
416{
417 bm::id64_t* cnt64;
418 BM_AVX2_POPCNT_PROLOG
419 __m256i cnt = _mm256_setzero_si256();
420 do
421 {
422 __m256i tmp0 = _mm256_load_si256(block);
423 __m256i tmp1 = _mm256_load_si256(mask_block);
424
425 tmp0 = _mm256_andnot_si256(tmp1, tmp0);
426
427 BM_AVX2_BIT_COUNT(bc, tmp0)
428 cnt = _mm256_add_epi64(cnt, bc);
429
430 ++block; ++mask_block;
431
432 } while (block < block_end);
433
434 cnt64 = (bm::id64_t*)&cnt;
435 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
436}
437
438
439
440/*!
441 @brief XOR array elements to specified mask
442 *dst = *src ^ mask
443
444 @ingroup AVX2
445*/
446inline
447void avx2_xor_arr_2_mask(__m256i* BMRESTRICT dst,
448 const __m256i* BMRESTRICT src,
449 const __m256i* BMRESTRICT src_end,
450 bm::word_t mask)
451{
452 __m256i yM = _mm256_set1_epi32(int(mask));
453 do
454 {
455 _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM)); // ymm = ymm ^ mask
456 _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
457 _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
458 _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
459
460 dst += 4; src += 4;
461 } while (src < src_end);
462}
463
464
465/*!
466 @brief Inverts array elements and NOT them to specified mask
467 *dst = ~*src & mask
468
469 @ingroup AVX2
470*/
471inline
472void avx2_andnot_arr_2_mask(__m256i* BMRESTRICT dst,
473 const __m256i* BMRESTRICT src,
474 const __m256i* BMRESTRICT src_end,
475 bm::word_t mask)
476{
477 __m256i yM = _mm256_set1_epi32(int(mask));
478 do
479 {
480 _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM)); // ymm1 = (~ymm1) & ymm2
481 _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
482 _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
483 _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
484
485 dst += 4; src += 4;
486 } while (src < src_end);
487}
488
489/*!
490 @brief AND array elements against another array
491 *dst &= *src
492 @return 0 if destination does not have any bits
493 @ingroup AVX2
494*/
495inline
496unsigned avx2_and_block(__m256i* BMRESTRICT dst,
497 const __m256i* BMRESTRICT src)
498{
499 __m256i m1A, m1B, m1C, m1D;
500 __m256i accA, accB, accC, accD;
501
502 const __m256i* BMRESTRICT src_end =
503 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
504
505 accA = accB = accC = accD = _mm256_setzero_si256();
506
507 do
508 {
509 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
510 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
511 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
512 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
513
514 _mm256_store_si256(dst+0, m1A);
515 _mm256_store_si256(dst+1, m1B);
516 _mm256_store_si256(dst+2, m1C);
517 _mm256_store_si256(dst+3, m1D);
518
519 accA = _mm256_or_si256(accA, m1A);
520 accB = _mm256_or_si256(accB, m1B);
521 accC = _mm256_or_si256(accC, m1C);
522 accD = _mm256_or_si256(accD, m1D);
523
524 src += 4; dst += 4;
525
526 } while (src < src_end);
527
528 accA = _mm256_or_si256(accA, accB); // A = A | B
529 accC = _mm256_or_si256(accC, accD); // C = C | D
530 accA = _mm256_or_si256(accA, accC); // A = A | C
531
532 return !_mm256_testz_si256(accA, accA);
533}
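// Usage sketch (illustrative): in-place AND of two aligned bit-blocks; the
// non-zero return tells the caller whether dst still has any bits, so an
// emptied destination block can be detected and released right away.
//
//   unsigned any = bm::avx2_and_block((__m256i*)dst_blk, (const __m256i*)src_blk);
//   // dst_blk and src_blk are hypothetical aligned bm::word_t blocks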
534
535/*!
536 @brief AND block digest stride
537 *dst &= *src
538
539 @return true if stride is all zero
540 @ingroup AVX2
541*/
542inline
543bool avx2_and_digest(__m256i* BMRESTRICT dst,
544 const __m256i* BMRESTRICT src)
545{
546 __m256i m1A, m1B, m1C, m1D;
547
548 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
549 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
550 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
551 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
552
553 _mm256_store_si256(dst+0, m1A);
554 _mm256_store_si256(dst+1, m1B);
555 _mm256_store_si256(dst+2, m1C);
556 _mm256_store_si256(dst+3, m1D);
557
558 m1A = _mm256_or_si256(m1A, m1B);
559 m1C = _mm256_or_si256(m1C, m1D);
560 m1A = _mm256_or_si256(m1A, m1C);
561
562 return _mm256_testz_si256(m1A, m1A);
563}
564
565/*!
566 @brief AND block digest stride 2 way
567 *dst = *src1 & *src2
568
569 @return true if stride is all zero
570 @ingroup AVX2
571*/
572inline
573bool avx2_and_digest_2way(__m256i* BMRESTRICT dst,
574 const __m256i* BMRESTRICT src1,
575 const __m256i* BMRESTRICT src2)
576{
577 __m256i m1A, m1B, m1C, m1D;
578
579 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
580 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
581 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
582 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
583
584 _mm256_store_si256(dst+0, m1A);
585 _mm256_store_si256(dst+1, m1B);
586 _mm256_store_si256(dst+2, m1C);
587 _mm256_store_si256(dst+3, m1D);
588
589 m1A = _mm256_or_si256(m1A, m1B);
590 m1C = _mm256_or_si256(m1C, m1D);
591 m1A = _mm256_or_si256(m1A, m1C);
592
593 return _mm256_testz_si256(m1A, m1A);
594}
595
596/*!
597 @brief AND-OR block digest stride 2 way
598 *dst |= *src1 & *src2
599
600 @return true if stride is all zero
601 @ingroup AVX2
602*/
603inline
604bool avx2_and_or_digest_2way(__m256i* BMRESTRICT dst,
605 const __m256i* BMRESTRICT src1,
606 const __m256i* BMRESTRICT src2)
607{
608 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFFFFFFFF
609
610 __m256i m1A, m1B, m1C, m1D;
611 __m256i mACC1;
612 __m256i mSA, mSB, mSC, mSD;
613
614
615 mSA = _mm256_load_si256(dst+0);
616 mSB = _mm256_load_si256(dst+1);
617 mACC1 = _mm256_and_si256(mSA, mSB);
618
619 mSC = _mm256_load_si256(dst+2);
620 mSD = _mm256_load_si256(dst+3);
621
622 mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
623
624 mACC1 = _mm256_xor_si256(mACC1, maskF);
625 if (_mm256_testz_si256(mACC1, mACC1)) // whole wave is saturated 1111s already
626 return false;
627
628
629 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
630 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
631 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
632 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
633
634 mACC1 =
635 _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
636 bool all_z = _mm256_testz_si256(mACC1, mACC1);
637 if (all_z)
638 return all_z;
639
640 m1A = _mm256_or_si256(mSA, m1A);
641 m1B = _mm256_or_si256(mSB, m1B);
642 m1C = _mm256_or_si256(mSC, m1C);
643 m1D = _mm256_or_si256(mSD, m1D);
644
645 _mm256_store_si256(dst+0, m1A);
646 _mm256_store_si256(dst+1, m1B);
647 _mm256_store_si256(dst+2, m1C);
648 _mm256_store_si256(dst+3, m1D);
649
650 return all_z;
651}
652
653
654/*!
655 @brief AND block digest stride (5-way): *dst &= *src1 & *src2 & *src3 & *src4
656 @ingroup AVX2
657*/
658inline
659bool avx2_and_digest_5way(__m256i* BMRESTRICT dst,
660 const __m256i* BMRESTRICT src1,
661 const __m256i* BMRESTRICT src2,
662 const __m256i* BMRESTRICT src3,
663 const __m256i* BMRESTRICT src4)
664{
665 __m256i m1A, m1B, m1C, m1D;
666 __m256i m1E, m1F, m1G, m1H;
667
668 {
669 __m256i s1_0, s2_0, s1_1, s2_1;
670
671 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
672 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
673 m1A = _mm256_and_si256(s1_0, s2_0);
674 m1B = _mm256_and_si256(s1_1, s2_1);
675 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
676 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
677 m1C = _mm256_and_si256(s1_0, s2_0);
678 m1D = _mm256_and_si256(s1_1, s2_1);
679 }
680 {
681 __m256i s3_0, s4_0, s3_1, s4_1;
682
683 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
684 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
685 m1E = _mm256_and_si256(s3_0, s4_0);
686 m1F = _mm256_and_si256(s3_1, s4_1);
687
688 m1A = _mm256_and_si256(m1A, m1E);
689 m1B = _mm256_and_si256(m1B, m1F);
690
691 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
692 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
693 m1G = _mm256_and_si256(s3_0, s4_0);
694 m1H = _mm256_and_si256(s3_1, s4_1);
695 }
696 {
697 __m256i dst0, dst1;
698 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
699
700 m1C = _mm256_and_si256(m1C, m1G);
701 m1D = _mm256_and_si256(m1D, m1H);
702 m1A = _mm256_and_si256(m1A, dst0);
703 m1B = _mm256_and_si256(m1B, dst1);
704
705 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
706
707 m1C = _mm256_and_si256(m1C, dst0);
708 m1D = _mm256_and_si256(m1D, dst1);
709 }
710 _mm256_store_si256(dst + 0, m1A);
711 _mm256_store_si256(dst + 1, m1B);
712 _mm256_store_si256(dst + 2, m1C);
713 _mm256_store_si256(dst + 3, m1D);
714
715 m1A = _mm256_or_si256(m1A, m1B);
716 m1C = _mm256_or_si256(m1C, m1D);
717 m1A = _mm256_or_si256(m1A, m1C);
718
719 return _mm256_testz_si256(m1A, m1A);
720}
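// Usage sketch (illustrative, assuming the 5-way digest AND above): one digest
// stride of dst is AND-combined with four source strides in a single pass,
// e.g. when intersecting five bit-vectors; a true result means the stride went
// all-zero and its digest bit can be cleared by the caller.
//
//   bool all_z = bm::avx2_and_digest_5way(dst_w, s1_w, s2_w, s3_w, s4_w);
//   // dst_w and s1_w..s4_w are hypothetical __m256i* pointers into aligned blocks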
721
722/*!
723 @brief AND array elements against another array (unaligned)
724 *dst &= *src
725 @return 0 if destination does not have any bits
726 @ingroup AVX2
727*/
728inline
729unsigned avx2_and_arr_unal(__m256i* BMRESTRICT dst,
730 const __m256i* BMRESTRICT src,
731 const __m256i* BMRESTRICT src_end)
732{
733 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
734 __m256i accA, accB, accC, accD;
735
736 accA = _mm256_setzero_si256();
737 accB = _mm256_setzero_si256();
738 accC = _mm256_setzero_si256();
739 accD = _mm256_setzero_si256();
740
741 do
742 {
743 m1A = _mm256_loadu_si256(src+0);
744 m2A = _mm256_load_si256(dst+0);
745 m1A = _mm256_and_si256(m1A, m2A);
746 _mm256_store_si256(dst+0, m1A);
747 accA = _mm256_or_si256(accA, m1A);
748
749 m1B = _mm256_loadu_si256(src+1);
750 m2B = _mm256_load_si256(dst+1);
751 m1B = _mm256_and_si256(m1B, m2B);
752 _mm256_store_si256(dst+1, m1B);
753 accB = _mm256_or_si256(accB, m1B);
754
755 m1C = _mm256_loadu_si256(src+2);
756 m2C = _mm256_load_si256(dst+2);
757 m1C = _mm256_and_si256(m1C, m2C);
758 _mm256_store_si256(dst+2, m1C);
759 accC = _mm256_or_si256(accC, m1C);
760
761 m1D = _mm256_loadu_si256(src+3);
762 m2D = _mm256_load_si256(dst+3);
763 m1D = _mm256_and_si256(m1D, m2D);
764 _mm256_store_si256(dst+3, m1D);
765 accD = _mm256_or_si256(accD, m1D);
766
767 src += 4; dst += 4;
768
769 } while (src < src_end);
770
771 accA = _mm256_or_si256(accA, accB); // A = A | B
772 accC = _mm256_or_si256(accC, accD); // C = C | D
773 accA = _mm256_or_si256(accA, accC); // A = A | C
774
775 return !_mm256_testz_si256(accA, accA);
776}
777
778
779/*!
780 @brief OR array elements against another array
781 *dst |= *src
782 @return true if all bits are 1
783
784 @ingroup AVX2
785*/
786inline
787bool avx2_or_block(__m256i* BMRESTRICT dst,
788 const __m256i* BMRESTRICT src)
789{
790 __m256i m1A, m1B, m1C, m1D;
791
792 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
793 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
794
795 __m256i* BMRESTRICT dst2 =
796 (__m256i*)((bm::word_t*)(dst) + bm::set_block_size/2);
797 const __m256i* BMRESTRICT src2 =
798 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size/2);
799 const __m256i* BMRESTRICT src_end =
800 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
801 do
802 {
803 m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
804 m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
805 mAccF0 = _mm256_and_si256(mAccF0, m1A);
806 mAccF0 = _mm256_and_si256(mAccF0, m1B);
807
808 _mm256_stream_si256(dst, m1A);
809 _mm256_stream_si256(dst+1, m1B);
810
811 src += 2; dst += 2;
812
813 m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
814 m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
815 mAccF1 = _mm256_and_si256(mAccF1, m1C);
816 mAccF1 = _mm256_and_si256(mAccF1, m1D);
817
818 _mm256_stream_si256(dst2, m1C);
819 _mm256_stream_si256(dst2+1, m1D);
820
821 src2 += 2; dst2 += 2;
822 } while (src2 < src_end);
823
824 __m256i maskF = _mm256_set1_epi32(~0u);
825 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
826 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
827 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
828 return (maskA == ~0u);
829}
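// Note (illustrative): the loop above walks two halves of the block at once and
// writes with _mm256_stream_si256 non-temporal stores, so the freshly written
// destination does not displace useful cache lines; the all-ones accumulators
// let the caller detect a saturated (FULL) block.
//
//   bool all_one = bm::avx2_or_block((__m256i*)dst_blk, (const __m256i*)src_blk);
//   // dst_blk and src_blk are hypothetical aligned bm::word_t blocks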
830
831
832/*!
833 @brief OR array elements against another unaligned array
834 *dst |= *src
835 @return true if all bits are 1
836
837 @ingroup AVX2
838*/
839inline
840bool avx2_or_arr_unal(__m256i* BMRESTRICT dst,
841 const __m256i* BMRESTRICT src,
842 const __m256i* BMRESTRICT src_end)
843{
844 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
845 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
846 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
847 do
848 {
849 m1A = _mm256_loadu_si256(src+0);
850 m2A = _mm256_load_si256(dst+0);
851 m1A = _mm256_or_si256(m1A, m2A);
852 _mm256_store_si256(dst+0, m1A);
853
854 m1B = _mm256_loadu_si256(src+1);
855 m2B = _mm256_load_si256(dst+1);
856 m1B = _mm256_or_si256(m1B, m2B);
857 _mm256_store_si256(dst+1, m1B);
858
859 m1C = _mm256_loadu_si256(src+2);
860 m2C = _mm256_load_si256(dst+2);
861 m1C = _mm256_or_si256(m1C, m2C);
862 _mm256_store_si256(dst+2, m1C);
863
864 m1D = _mm256_loadu_si256(src+3);
865 m2D = _mm256_load_si256(dst+3);
866 m1D = _mm256_or_si256(m1D, m2D);
867 _mm256_store_si256(dst+3, m1D);
868
869 mAccF1 = _mm256_and_si256(mAccF1, m1C);
870 mAccF1 = _mm256_and_si256(mAccF1, m1D);
871 mAccF0 = _mm256_and_si256(mAccF0, m1A);
872 mAccF0 = _mm256_and_si256(mAccF0, m1B);
873
874 src += 4; dst += 4;
875
876 } while (src < src_end);
877
878 __m256i maskF = _mm256_set1_epi32(~0u);
879 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
880 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
881 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
882 return (maskA == ~0u);
883}
884
885/*!
886 @brief OR 2 arrays and copy to the destination
887 *dst = *src1 | *src2
888 @return true if all bits are 1
889
890 @ingroup AVX2
891*/
892inline
893bool avx2_or_block_2way(__m256i* BMRESTRICT dst,
894 const __m256i* BMRESTRICT src1,
895 const __m256i* BMRESTRICT src2)
896{
897 __m256i m1A, m1B, m1C, m1D;
898 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
899 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
900 const __m256i* BMRESTRICT src_end1 =
901 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
902
903 do
904 {
905 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
906 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
907 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
908 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
909
910 _mm256_store_si256(dst+0, m1A);
911 _mm256_store_si256(dst+1, m1B);
912 _mm256_store_si256(dst+2, m1C);
913 _mm256_store_si256(dst+3, m1D);
914
915 mAccF1 = _mm256_and_si256(mAccF1, m1C);
916 mAccF1 = _mm256_and_si256(mAccF1, m1D);
917 mAccF0 = _mm256_and_si256(mAccF0, m1A);
918 mAccF0 = _mm256_and_si256(mAccF0, m1B);
919
920 src1 += 4; src2 += 4; dst += 4;
921
922 } while (src1 < src_end1);
923
924 __m256i maskF = _mm256_set1_epi32(~0u);
925 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
926 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
927 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
928 return (maskA == ~0u);
929}
930
931/*!
932 @brief OR array elements against another 2 arrays
933 *dst |= *src1 | *src2
934 @return true if all bits are 1
935
936 @ingroup AVX2
937*/
938inline
939bool avx2_or_block_3way(__m256i* BMRESTRICT dst,
940 const __m256i* BMRESTRICT src1,
941 const __m256i* BMRESTRICT src2)
942{
943 __m256i m1A, m1B, m1C, m1D;
944 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
945 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
946 const __m256i* BMRESTRICT src_end1 =
947 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
948
949 do
950 {
951 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
952 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
953 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
954 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
955
956 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
957 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
958 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
959 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
960
961 _mm256_store_si256(dst+0, m1A);
962 _mm256_store_si256(dst+1, m1B);
963 _mm256_store_si256(dst+2, m1C);
964 _mm256_store_si256(dst+3, m1D);
965
966 mAccF1 = _mm256_and_si256(mAccF1, m1C);
967 mAccF1 = _mm256_and_si256(mAccF1, m1D);
968 mAccF0 = _mm256_and_si256(mAccF0, m1A);
969 mAccF0 = _mm256_and_si256(mAccF0, m1B);
970
971 src1 += 4; src2 += 4; dst += 4;
972
973 } while (src1 < src_end1);
974
975 __m256i maskF = _mm256_set1_epi32(~0u);
976 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
977 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
978 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
979 return (maskA == ~0u);
980}
981
982
983/*!
984 @brief OR array elements against another 4 arrays
985 *dst |= *src1 | *src2 | *src3 | *src4
986 @return true if all bits are 1
987
988 @ingroup AVX2
989*/
990inline
991bool avx2_or_block_5way(__m256i* BMRESTRICT dst,
992 const __m256i* BMRESTRICT src1,
993 const __m256i* BMRESTRICT src2,
994 const __m256i* BMRESTRICT src3,
995 const __m256i* BMRESTRICT src4)
996{
997 __m256i m1A, m1B, m1C, m1D;
998 __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
999 __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
1000
1001 const __m256i* BMRESTRICT src_end1 =
1002 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1003
1004 do
1005 {
1006 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1007 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1008 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1009 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1010
1011 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1012 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1013 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1014 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1015
1016 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
1017 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
1018 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
1019 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));
1020
1021 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
1022 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
1023 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
1024 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));
1025
1026 _mm256_stream_si256(dst+0, m1A);
1027 _mm256_stream_si256(dst+1, m1B);
1028 _mm256_stream_si256(dst+2, m1C);
1029 _mm256_stream_si256(dst+3, m1D);
1030
1031 mAccF1 = _mm256_and_si256(mAccF1, m1C);
1032 mAccF1 = _mm256_and_si256(mAccF1, m1D);
1033 mAccF0 = _mm256_and_si256(mAccF0, m1A);
1034 mAccF0 = _mm256_and_si256(mAccF0, m1B);
1035
1036 src1 += 4; src2 += 4;
1037 src3 += 4; src4 += 4;
1038 _mm_prefetch ((const char*)src3, _MM_HINT_T0);
1039 _mm_prefetch ((const char*)src4, _MM_HINT_T0);
1040
1041 dst += 4;
1042
1043 } while (src1 < src_end1);
1044
1045 __m256i maskF = _mm256_set1_epi32(~0u);
1046 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1047 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1048 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1049 return (maskA == ~0u);
1050}
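// Usage sketch (illustrative, assuming the 5-way OR above): four source blocks
// are OR-folded into dst in one pass; the prefetch of src3/src4 hides part of
// the memory latency when many blocks are aggregated.
//
//   bool all_one = bm::avx2_or_block_5way(dst_w, s1_w, s2_w, s3_w, s4_w);
//   // dst_w and s1_w..s4_w are hypothetical __m256i* pointers to aligned blocks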
1051
1052
1053/*!
1054 @brief XOR block against another
1055 *dst ^= *src
1056 @return 0 if destination does not have any bits
1057 @ingroup AVX2
1058*/
1059inline
1060unsigned avx2_xor_block(__m256i* BMRESTRICT dst,
1061 const __m256i* BMRESTRICT src)
1062{
1063 __m256i m1A, m1B, m1C, m1D;
1064 __m256i accA, accB, accC, accD;
1065
1066 const __m256i* BMRESTRICT src_end =
1067 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1068
1069 accA = accB = accC = accD = _mm256_setzero_si256();
1070
1071 do
1072 {
1073 m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1074 m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1075 m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1076 m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1077
1078 _mm256_store_si256(dst+0, m1A);
1079 _mm256_store_si256(dst+1, m1B);
1080 _mm256_store_si256(dst+2, m1C);
1081 _mm256_store_si256(dst+3, m1D);
1082
1083 accA = _mm256_or_si256(accA, m1A);
1084 accB = _mm256_or_si256(accB, m1B);
1085 accC = _mm256_or_si256(accC, m1C);
1086 accD = _mm256_or_si256(accD, m1D);
1087
1088 src += 4; dst += 4;
1089
1090 } while (src < src_end);
1091
1092 accA = _mm256_or_si256(accA, accB); // A = A | B
1093 accC = _mm256_or_si256(accC, accD); // C = C | D
1094 accA = _mm256_or_si256(accA, accC); // A = A | C
1095
1096 return !_mm256_testz_si256(accA, accA);
1097}
1098
1099/*!
1100 @brief 3 operand XOR
1101 *dst = *src1 ^ *src2
1102 @return 0 if destination does not have any bits
1103 @ingroup AVX2
1104*/
1105inline
1106unsigned avx2_xor_block_2way(__m256i* BMRESTRICT dst,
1107 const __m256i* BMRESTRICT src1,
1108 const __m256i* BMRESTRICT src2)
1109{
1110 __m256i m1A, m1B, m1C, m1D;
1111 __m256i accA, accB, accC, accD;
1112
1113 const __m256i* BMRESTRICT src1_end =
1114 (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1115
1116 accA = accB = accC = accD = _mm256_setzero_si256();
1117
1118 do
1119 {
1120 m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
1121 m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
1122 m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
1123 m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));
1124
1125 _mm256_store_si256(dst + 0, m1A);
1126 _mm256_store_si256(dst + 1, m1B);
1127 _mm256_store_si256(dst + 2, m1C);
1128 _mm256_store_si256(dst + 3, m1D);
1129
1130 accA = _mm256_or_si256(accA, m1A);
1131 accB = _mm256_or_si256(accB, m1B);
1132 accC = _mm256_or_si256(accC, m1C);
1133 accD = _mm256_or_si256(accD, m1D);
1134
1135 src1 += 4; src2 += 4; dst += 4;
1136
1137 } while (src1 < src1_end);
1138
1139 accA = _mm256_or_si256(accA, accB); // A = A | B
1140 accC = _mm256_or_si256(accC, accD); // C = C | D
1141 accA = _mm256_or_si256(accA, accC); // A = A | C
1142
1143 return !_mm256_testz_si256(accA, accA);
1144}
1145
1146
1147/*!
1148 @brief AND-NOT (SUB) array elements against another array
1149 *dst &= ~*src
1150
1151 @return 0 if destination does not have any bits
1152
1153 @ingroup AVX2
1154*/
1155inline
1156unsigned avx2_sub_block(__m256i* BMRESTRICT dst,
1157 const __m256i* BMRESTRICT src)
1158{
1159 __m256i m1A, m1B, m1C, m1D;
1160 __m256i accA, accB, accC, accD;
1161
1162 accA = accB = accC = accD = _mm256_setzero_si256();
1163
1164 const __m256i* BMRESTRICT src_end =
1165 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1166
1167 do
1168 {
1169 m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
1170 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1171 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1172 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1173
1174 _mm256_store_si256(dst+2, m1C);
1175 _mm256_store_si256(dst+3, m1D);
1176 _mm256_store_si256(dst+0, m1A);
1177 _mm256_store_si256(dst+1, m1B);
1178
1179 accA = _mm256_or_si256(accA, m1A);
1180 accB = _mm256_or_si256(accB, m1B);
1181 accC = _mm256_or_si256(accC, m1C);
1182 accD = _mm256_or_si256(accD, m1D);
1183
1184 src += 4; dst += 4;
1185 } while (src < src_end);
1186
1187 accA = _mm256_or_si256(accA, accB); // A = A | B
1188 accC = _mm256_or_si256(accC, accD); // C = C | D
1189 accA = _mm256_or_si256(accA, accC); // A = A | C
1190
1191 return !_mm256_testz_si256(accA, accA);
1192}
1193
1194/*!
1195 @brief SUB (AND NOT) block digest stride
1196 *dst &= ~*src
1197
1198 @return true if stride is all zero
1199 @ingroup AVX2
1200*/
1201inline
1202bool avx2_sub_digest(__m256i* BMRESTRICT dst,
1203 const __m256i* BMRESTRICT src)
1204{
1205 __m256i m1A, m1B, m1C, m1D;
1206
1207 m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1208 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1209 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1210 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1211
1212 _mm256_store_si256(dst+0, m1A);
1213 _mm256_store_si256(dst+1, m1B);
1214 _mm256_store_si256(dst+2, m1C);
1215 _mm256_store_si256(dst+3, m1D);
1216
1217 m1A = _mm256_or_si256(m1A, m1B);
1218 m1C = _mm256_or_si256(m1C, m1D);
1219 m1A = _mm256_or_si256(m1A, m1C);
1220
1221 return _mm256_testz_si256(m1A, m1A);
1222}
1223
1224/*!
1225 @brief 2-operand SUB (AND NOT) block digest stride
1226 *dst = *src1 & ~*src2
1227
1228 @return true if stride is all zero
1229 @ingroup AVX2
1230*/
1231inline
1232bool avx2_sub_digest_2way(__m256i* BMRESTRICT dst,
1233 const __m256i* BMRESTRICT src1,
1234 const __m256i* BMRESTRICT src2)
1235{
1236 __m256i m1A, m1B, m1C, m1D;
1237
1238 m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
1239 m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
1240 m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
1241 m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));
1242
1243 _mm256_store_si256(dst+0, m1A);
1244 _mm256_store_si256(dst+1, m1B);
1245 _mm256_store_si256(dst+2, m1C);
1246 _mm256_store_si256(dst+3, m1D);
1247
1248 m1A = _mm256_or_si256(m1A, m1B);
1249 m1C = _mm256_or_si256(m1C, m1D);
1250 m1A = _mm256_or_si256(m1A, m1C);
1251
1252 return _mm256_testz_si256(m1A, m1A);
1253}
1254
1255
1256
1257/*!
1258 @brief AVX2 block memset
1259 *dst = value
1260
1261 @ingroup AVX2
1262*/
1263inline
1264void avx2_set_block(__m256i* BMRESTRICT dst, bm::word_t value)
1265{
1266 __m256i* BMRESTRICT dst_end =
1267 (__m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1268
1269 __m256i ymm0 = _mm256_set1_epi32(int(value));
1270 do
1271 {
1272 _mm256_store_si256(dst, ymm0);
1273 _mm256_store_si256(dst+1, ymm0);
1274 _mm256_store_si256(dst+2, ymm0);
1275 _mm256_store_si256(dst+3, ymm0);
1276
1277 dst += 4;
1278 } while (dst < dst_end);
1279}
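// Usage sketch (illustrative):
//
//   bm::avx2_set_block((__m256i*)dst_blk, 0u);  // clear the whole bit-block
//   bm::avx2_set_block((__m256i*)dst_blk, ~0u); // fill the whole bit-block with 1s
//   // dst_blk is a hypothetical aligned bm::word_t block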
1280
1281
1282
1283/*!
1284 @brief AVX2 block copy
1285 *dst = *src
1286
1287 @ingroup AVX2
1288*/
1289inline
1290void avx2_copy_block(__m256i* BMRESTRICT dst,
1291 const __m256i* BMRESTRICT src)
1292{
1293 __m256i ymm0, ymm1, ymm2, ymm3;
1294
1295 const __m256i* BMRESTRICT src_end =
1296 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1297
1298 do
1299 {
1300 ymm0 = _mm256_load_si256(src+0);
1301 ymm1 = _mm256_load_si256(src+1);
1302 ymm2 = _mm256_load_si256(src+2);
1303 ymm3 = _mm256_load_si256(src+3);
1304
1305 _mm256_store_si256(dst+0, ymm0);
1306 _mm256_store_si256(dst+1, ymm1);
1307 _mm256_store_si256(dst+2, ymm2);
1308 _mm256_store_si256(dst+3, ymm3);
1309
1310 ymm0 = _mm256_load_si256(src+4);
1311 ymm1 = _mm256_load_si256(src+5);
1312 ymm2 = _mm256_load_si256(src+6);
1313 ymm3 = _mm256_load_si256(src+7);
1314
1315 _mm256_store_si256(dst+4, ymm0);
1316 _mm256_store_si256(dst+5, ymm1);
1317 _mm256_store_si256(dst+6, ymm2);
1318 _mm256_store_si256(dst+7, ymm3);
1319
1320 src += 8; dst += 8;
1321
1322 } while (src < src_end);
1323}
1324
1325/*!
1326 @brief AVX2 block copy (unaligned SRC)
1327 *dst = *src
1328
1329 @ingroup AVX2
1330*/
1331inline
1332void avx2_copy_block_unalign(__m256i* BMRESTRICT dst,
1333 const __m256i* BMRESTRICT src)
1334{
1335 __m256i ymm0, ymm1, ymm2, ymm3;
1336
1337 const __m256i* BMRESTRICT src_end =
1338 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1339
1340 do
1341 {
1342 ymm0 = _mm256_loadu_si256(src+0);
1343 ymm1 = _mm256_loadu_si256(src+1);
1344 ymm2 = _mm256_loadu_si256(src+2);
1345 ymm3 = _mm256_loadu_si256(src+3);
1346
1347 _mm256_store_si256(dst+0, ymm0);
1348 _mm256_store_si256(dst+1, ymm1);
1349 _mm256_store_si256(dst+2, ymm2);
1350 _mm256_store_si256(dst+3, ymm3);
1351
1352 ymm0 = _mm256_loadu_si256(src+4);
1353 ymm1 = _mm256_loadu_si256(src+5);
1354 ymm2 = _mm256_loadu_si256(src+6);
1355 ymm3 = _mm256_loadu_si256(src+7);
1356
1357 _mm256_store_si256(dst+4, ymm0);
1358 _mm256_store_si256(dst+5, ymm1);
1359 _mm256_store_si256(dst+6, ymm2);
1360 _mm256_store_si256(dst+7, ymm3);
1361
1362 src += 8; dst += 8;
1363
1364 } while (src < src_end);
1365}
1366
1367
1368
1369/*!
1370 @brief AVX2 block copy (with non-temporal streaming stores)
1371 *dst = *src
1372
1373 @ingroup AVX2
1374*/
1375inline
1376void avx2_stream_block(__m256i* BMRESTRICT dst,
1377 const __m256i* BMRESTRICT src)
1378{
1379 __m256i ymm0, ymm1, ymm2, ymm3;
1380
1381 const __m256i* BMRESTRICT src_end =
1382 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1383
1384 do
1385 {
1386 ymm0 = _mm256_load_si256(src+0);
1387 ymm1 = _mm256_load_si256(src+1);
1388 ymm2 = _mm256_load_si256(src+2);
1389 ymm3 = _mm256_load_si256(src+3);
1390
1391 _mm256_stream_si256(dst+0, ymm0);
1392 _mm256_stream_si256(dst+1, ymm1);
1393 _mm256_stream_si256(dst+2, ymm2);
1394 _mm256_stream_si256(dst+3, ymm3);
1395
1396 ymm0 = _mm256_load_si256(src+4);
1397 ymm1 = _mm256_load_si256(src+5);
1398 ymm2 = _mm256_load_si256(src+6);
1399 ymm3 = _mm256_load_si256(src+7);
1400
1401 _mm256_stream_si256(dst+4, ymm0);
1402 _mm256_stream_si256(dst+5, ymm1);
1403 _mm256_stream_si256(dst+6, ymm2);
1404 _mm256_stream_si256(dst+7, ymm3);
1405
1406 src += 8; dst += 8;
1407
1408 } while (src < src_end);
1409}
1410
1411/*!
1412 @brief AVX2 block copy (unaligned SRC)
1413 *dst = *src
1414
1415 @ingroup AVX2
1416*/
1417inline
1418void avx2_stream_block_unalign(__m256i* BMRESTRICT dst,
1419 const __m256i* BMRESTRICT src)
1420{
1421 __m256i ymm0, ymm1, ymm2, ymm3;
1422
1423 const __m256i* BMRESTRICT src_end =
1424 (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1425
1426 do
1427 {
1428 ymm0 = _mm256_loadu_si256(src+0);
1429 ymm1 = _mm256_loadu_si256(src+1);
1430 ymm2 = _mm256_loadu_si256(src+2);
1431 ymm3 = _mm256_loadu_si256(src+3);
1432
1433 _mm256_stream_si256(dst+0, ymm0);
1434 _mm256_stream_si256(dst+1, ymm1);
1435 _mm256_stream_si256(dst+2, ymm2);
1436 _mm256_stream_si256(dst+3, ymm3);
1437
1438 ymm0 = _mm256_loadu_si256(src+4);
1439 ymm1 = _mm256_loadu_si256(src+5);
1440 ymm2 = _mm256_loadu_si256(src+6);
1441 ymm3 = _mm256_loadu_si256(src+7);
1442
1443 _mm256_stream_si256(dst+4, ymm0);
1444 _mm256_stream_si256(dst+5, ymm1);
1445 _mm256_stream_si256(dst+6, ymm2);
1446 _mm256_stream_si256(dst+7, ymm3);
1447
1448 src += 8; dst += 8;
1449
1450 } while (src < src_end);
1451}
1452
1453
1454
1455/*!
1456 @brief Invert bit-block
1457 *dst = ~*dst
1458 or
1459 *dst ^= ~0
1460
1461 @ingroup AVX2
1462*/
1463inline
1464void avx2_invert_block(__m256i* BMRESTRICT dst)
1465{
1466 __m256i maskFF = _mm256_set1_epi32(-1); // broadcast 0xFF
1467 const __m256i* BMRESTRICT dst_end =
1468 (const __m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1469
1470 __m256i ymm0, ymm1;
1471 do
1472 {
1473 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
1474 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
1475
1476 _mm256_store_si256(dst+0, ymm0);
1477 _mm256_store_si256(dst+1, ymm1);
1478
1479 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
1480 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
1481
1482 _mm256_store_si256(dst+2, ymm0);
1483 _mm256_store_si256(dst+3, ymm1);
1484
1485 dst += 4;
1486
1487 } while (dst < dst_end);
1488}
1489
1490/*!
1491 @brief check if block is all zero bits
1492 @ingroup AVX2
1493*/
1494inline
1495bool avx2_is_all_zero(const __m256i* BMRESTRICT block)
1496{
1497 const __m256i* BMRESTRICT block_end =
1498 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1499
1500 do
1501 {
1502 __m256i w0 = _mm256_load_si256(block+0);
1503 __m256i w1 = _mm256_load_si256(block+1);
1504
1505 __m256i wA = _mm256_or_si256(w0, w1);
1506
1507 __m256i w2 = _mm256_load_si256(block+2);
1508 __m256i w3 = _mm256_load_si256(block+3);
1509
1510 __m256i wB = _mm256_or_si256(w2, w3);
1511 wA = _mm256_or_si256(wA, wB);
1512
1513 if (!_mm256_testz_si256(wA, wA))
1514 return false;
1515 block += 4;
1516 } while (block < block_end);
1517 return true;
1518}
1519
1520/*!
1521 @brief check if digest stride is all zero bits
1522 @ingroup AVX2
1523*/
1524inline
1525bool avx2_is_digest_zero(const __m256i* BMRESTRICT block)
1526{
1527 __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
1528 __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
1529 wA = _mm256_or_si256(wA, wB);
1530
1531 return _mm256_testz_si256(wA, wA);
1532}
1533
1534/*!
1535 @brief set digest stride to 0xFF.. or 0x0 value
1536 @ingroup AVX2
1537*/
1538inline
1539void avx2_block_set_digest(__m256i* dst, unsigned value)
1540{
1541 __m256i mV = _mm256_set1_epi32(int(value));
1542 _mm256_store_si256(dst, mV);
1543 _mm256_store_si256(dst + 1, mV);
1544 _mm256_store_si256(dst + 2, mV);
1545 _mm256_store_si256(dst + 3, mV);
1546}
1547
1548/*!
1549 @brief check if block is all one bits
1550 @return true if all bits are 1
1551 @ingroup AVX2
1552*/
1553inline
1554bool avx2_is_all_one(const __m256i* BMRESTRICT block)
1555{
1556 const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFFFFFFFF
1557 const __m256i* BMRESTRICT block_end =
1558 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1559 do
1560 {
1561 __m256i m1A = _mm256_load_si256(block+0);
1562 __m256i m1B = _mm256_load_si256(block+1);
1563 m1A = _mm256_xor_si256(m1A, maskF);
1564 m1B = _mm256_xor_si256(m1B, maskF);
1565 m1A = _mm256_or_si256(m1A, m1B);
1566 if (!_mm256_testz_si256(m1A, m1A))
1567 return false;
1568 block += 2;
1569 } while (block < block_end);
1570 return true;
1571}
1572
1573/*!
1574 @brief check if wave of pointers is all 0xFF.. (all ones)
1575 @ingroup AVX2
1576*/
1577BMFORCEINLINE
1578bool avx2_test_all_one_wave(const void* ptr)
1579{
1580 __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFFFFFFFF
1581 __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF); // (w0 == maskF)
1582 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1583 return (maskA == ~0u);
1584}
1585
1586
1587/*!
1588 @brief check if wave of pointers is all NULL
1589 @ingroup AVX2
1590*/
1591BMFORCEINLINE
1592bool avx2_test_all_zero_wave(const void* ptr)
1593{
1594 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
1595 return _mm256_testz_si256(w0, w0);
1596}
1597
1598/*!
1599 @brief check if 2 waves of pointers are all NULL
1600 @ingroup AVX2
1601*/
1602BMFORCEINLINE
1603bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
1604{
1605 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1606 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1607 w0 = _mm256_or_si256(w0, w1);
1608 return _mm256_testz_si256(w0, w0);
1609}
1610
1611/*!
1612 @brief check if 2 waves of pointers are all the same (NULL or FULL)
1613 @ingroup AVX2
1614*/
1615BMFORCEINLINE
1616bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
1617{
1618 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1619 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1620 w0 = _mm256_xor_si256(w0, w1);
1621 return _mm256_testz_si256(w0, w0);
1622}
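// A "wave" in the tests above is one __m256i worth of data, e.g. a group of
// four 64-bit block pointers. Usage sketch (illustrative):
//
//   const void* ptr_wave = get_ptr_wave(); // hypothetical: 32 bytes of a pointer array
//   if (bm::avx2_test_all_zero_wave(ptr_wave))
//   {
//       // every pointer in this wave is NULL - the whole wave can be skipped
//   }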
1623
1624/*!
1625 @brief block shift left by 1
1626 @ingroup AVX2
1627*/
1628inline
1629bool avx2_shift_l1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1630{
1631 __m256i* block_end =
1632 (__m256i*)((bm::word_t*)(block) + bm::set_block_size);
1633
1634 __m256i m1COshft, m2COshft;
1635 __m256i mAcc = _mm256_set1_epi32(0);
1636 __m256i mMask1 = _mm256_set1_epi32(1);
1637 __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
1638 unsigned co2;
1639
1640 for (--block_end; block_end >= block; block_end -= 2)
1641 {
1642 __m256i m1A = _mm256_load_si256(block_end);
1643 __m256i m2A = _mm256_load_si256(block_end-1);
1644
1645 __m256i m1CO = _mm256_and_si256(m1A, mMask1);
1646 __m256i m2CO = _mm256_and_si256(m2A, mMask1);
1647
1648 co2 = _mm256_extract_epi32(m1CO, 0);
1649
1650 m1A = _mm256_srli_epi32(m1A, 1); // (block[i] >> 1u)
1651 m2A = _mm256_srli_epi32(m2A, 1);
1652
1653 // shift CO flags using -1 permute indexes, add CO to v[7]
1654 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1655 m1COshft = _mm256_insert_epi32(m1COshft, co1, 7); // v[7] = co_flag
1656
1657 co1 = co2;
1658
1659 co2 = _mm256_extract_epi32(m2CO, 0);
1660
1661 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1662 m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);
1663
1664 m1COshft = _mm256_slli_epi32(m1COshft, 31);
1665 m2COshft = _mm256_slli_epi32(m2COshft, 31);
1666
1667 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1668 m2A = _mm256_or_si256(m2A, m2COshft);
1669
1670 _mm256_store_si256(block_end, m1A);
1671 _mm256_store_si256(block_end-1, m2A);
1672
1673 mAcc = _mm256_or_si256(mAcc, m1A);
1674 mAcc = _mm256_or_si256(mAcc, m2A);
1675
1676 co1 = co2;
1677
1678 } // for
1679
1680 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1681 return co1;
1682}
1683
1684
1685/*!
1686 @brief block shift right by 1
1687 @ingroup AVX2
1688*/
1689inline
1690bool avx2_shift_r1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1691{
1692 const __m256i* block_end =
1693 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1694
1695 __m256i m1COshft, m2COshft;
1696 __m256i mAcc = _mm256_set1_epi32(0);
1697 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1698 unsigned co2;
1699
1700 for (;block < block_end; block+=2)
1701 {
1702 __m256i m1A = _mm256_load_si256(block);
1703 __m256i m2A = _mm256_load_si256(block+1);
1704
1705 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1706 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1707
1708 co2 = _mm256_extract_epi32(m1CO, 7);
1709
1710 m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1711 m2A = _mm256_slli_epi32(m2A, 1);
1712
1713 // shift CO flags using +1 permute indexes, add CO to v[0]
1714 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1715 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
1716
1717 co1 = co2;
1718
1719 co2 = _mm256_extract_epi32(m2CO, 7);
1720 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1721 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1722
1723 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1724 m2A = _mm256_or_si256(m2A, m2COshft);
1725
1726 _mm256_store_si256(block, m1A);
1727 _mm256_store_si256(block+1, m2A);
1728
1729 mAcc = _mm256_or_si256(mAcc, m1A);
1730 mAcc = _mm256_or_si256(mAcc, m2A);
1731
1732 co1 = co2;
1733 } // for
1734
1735 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1736 return co1;
1737}
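// Scalar model of the shift above (illustrative): a bit-block shift right
// (towards higher bit positions) is a left shift of every 32-bit word, where
// each word receives bit 31 of the previous word as carry-in; the
// permute/insert pair performs that hand-off across the 8 lanes of a __m256i.
//
//   unsigned carry = co1;
//   for (unsigned i = 0; i < bm::set_block_size; ++i)
//   {
//       unsigned w = blk[i]; // blk: hypothetical bm::word_t* view of the block
//       unsigned co = w >> 31;
//       blk[i] = (w << 1) | carry;
//       carry = co;
//   }
//   // carry now holds the block-level carry-out (the function's return value)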
1738
1739
1740/*!
1741 @brief fused block shift right by 1 plus AND
1742 @ingroup AVX2
1743*/
1744
1745inline
1746bool avx2_shift_r1_and(__m256i* BMRESTRICT block,
1747 bm::word_t co1,
1748 const __m256i* BMRESTRICT mask_block,
1749 bm::id64_t* BMRESTRICT digest)
1750{
1751 BM_ASSERT(*digest);
1752
1753 bm::word_t* wblock = (bm::word_t*) block;
1754 const bm::word_t* mblock = (const bm::word_t*) mask_block;
1755
1756 __m256i m1COshft, m2COshft;
1757 __m256i mAcc = _mm256_set1_epi32(0);
1758 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1759 unsigned co2;
1760
1761 bm::id64_t d, wd;
1762 wd = d = *digest;
1763 unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d)); // get first set bit
1764 for (; di < 64 ; ++di)
1765 {
1766 const unsigned d_base = di * bm::set_block_digest_wave_size;
1767 const bm::id64_t dmask = (1ull << di);
1768 if (d & dmask) // digest stride NOT empty
1769 {
1770 mAcc = _mm256_xor_si256(mAcc, mAcc); // mAcc = 0
1771
1772 mask_block = (__m256i*) &mblock[d_base];
1773 _mm_prefetch ((const char*)mask_block, _MM_HINT_NTA);
1774
1775 block = (__m256i*) &wblock[d_base];
1776
1777 for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
1778 {
1779 __m256i m1A = _mm256_load_si256(block);
1780 __m256i m2A = _mm256_load_si256(block+1);
1781
1782 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1783 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1784
1785 co2 = _mm256_extract_epi32(m1CO, 7);
1786
1787 m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1788 m2A = _mm256_slli_epi32(m2A, 1);
1789
1790 __m256i m1M = _mm256_load_si256(mask_block);
1791 __m256i m2M = _mm256_load_si256(mask_block+1);
1792
1793 // shift CO flags using +1 permute indexes, add CO to v[0]
1794 m1COshft = _mm256_insert_epi32(
1795 _mm256_permutevar8x32_epi32(m1CO, mCOidx),
1796 co1, 0); // v[0] = co_flag
1797
1798 co1 = co2;
1799 co2 = _mm256_extract_epi32(m2CO, 7);
1800 m2COshft = _mm256_insert_epi32(
1801 _mm256_permutevar8x32_epi32(m2CO, mCOidx),
1802 co1, 0);
1803
1804 m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1805 m2A = _mm256_or_si256(m2A, m2COshft);
1806
1807 m1A = _mm256_and_si256(m1A, m1M); // block[i] &= mask_block[i]
1808 m2A = _mm256_and_si256(m2A, m2M);
1809
1810 _mm256_store_si256(block, m1A);
1811 _mm256_store_si256(block+1, m2A);
1812
1813 mAcc = _mm256_or_si256(mAcc, m1A);
1814 mAcc = _mm256_or_si256(mAcc, m2A);
1815
1816 co1 = co2;
1817
1818 } // for i
1819
1820 if (_mm256_testz_si256(mAcc, mAcc)) // test if OR accum is zero
1821 d &= ~dmask; // clear the digest bit
1822
1823 wd = _blsr_u64(wd); // wd &= wd - 1; // reset lowest set bit
1824 }
1825 else // stride is empty
1826 {
1827 if (co1)
1828 {
1829 BM_ASSERT(co1 == 1);
1830 BM_ASSERT(wblock[d_base] == 0);
1831
1832 bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
1833 d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1834 co1 = 0;
1835 }
1836 if (!wd) // digest is empty, no CO -> exit
1837 break;
1838 }
1839 } // for di
1840
1841 *digest = d;
1842 return co1;
1843}
1844
1845
1846
1847/*
1848inline
1849void avx2_i32_shift()
1850{
1851 unsigned shift_in = 80;
1852
1853 __m256i mTest = _mm256_set_epi32(70, 60, 50, 40, 30, 20, 10, 100);
1854 __m256i mIdx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1855
1856 __m256i m1shft = _mm256_permutevar8x32_epi32(mTest, mIdx);
1857 m1shft = _mm256_insert_epi32(m1shft, shift_in, 0);
1858
1859 avx2_print256("m1shft=", m1shft);
1860}
1861*/
1862
1863
1864
1865/*!
1866 AVX2 calculate number of bit changes from 0 to 1
1867 @ingroup AVX2
1868*/
1869inline
1870unsigned avx2_bit_block_calc_change(const __m256i* BMRESTRICT block,
1871 unsigned size)
1872{
1873 BM_AVX2_POPCNT_PROLOG
1874
1875 const __m256i* block_end =
1876 (const __m256i*)((bm::word_t*)(block) + size);
1877
1878 __m256i m1COshft, m2COshft;
1879 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1880 __m256i cntAcc = _mm256_setzero_si256();
1881
1882 unsigned w0 = *((bm::word_t*)(block));
1883 unsigned count = 1;
1884
1885 BM_ALIGN32 bm::id64_t cnt_v[4] BM_ALIGN32ATTR;
1886
1887 unsigned co2, co1 = 0;
1888 for (;block < block_end; block+=2)
1889 {
1890 __m256i m1A = _mm256_load_si256(block);
1891 __m256i m2A = _mm256_load_si256(block+1);
1892
1893 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1894 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1895
1896 co2 = _mm256_extract_epi32(m1CO, 7);
1897
1898 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1899 __m256i m2As = _mm256_slli_epi32(m2A, 1);
1900
1901 // shift CO flags using +1 permute indexes, add CO to v[0]
1902 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1903 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
1904
1905 co1 = co2;
1906
1907 co2 = _mm256_extract_epi32(m2CO, 7);
1908 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1909 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1910
1911 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
1912 m2As = _mm256_or_si256(m2As, m2COshft);
1913
1914 co1 = co2;
1915
1916 // we now have two shifted AVX2 regs with carry-over
1917 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
1918 m2A = _mm256_xor_si256(m2A, m2As);
1919
1920 {
1921 BM_AVX2_BIT_COUNT(bc, m1A)
1922 cntAcc = _mm256_add_epi64(cntAcc, bc);
1923 BM_AVX2_BIT_COUNT(bc, m2A)
1924 cntAcc = _mm256_add_epi64(cntAcc, bc);
1925 }
1926 } // for
1927
1928 // horizontal count sum
1929 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
1930 count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
1931
1932 count -= (w0 & 1u); // correct initial carry-in error
1933 return count;
1934}
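// Scalar model of the change count above (illustrative): the number of 0-1
// transitions in the block is 1 + popcount(x ^ (x << 1)) over the whole bit
// string, computed 32 bits at a time with a carry hand-off between words.
//
//   unsigned count = 1, co = 0;
//   for (unsigned i = 0; i < bm::set_block_size; ++i)
//   {
//       unsigned w = blk[i]; // blk: hypothetical bm::word_t* view of the block
//       count += unsigned(_mm_popcnt_u32(w ^ ((w << 1) | co)));
//       co = w >> 31;
//   }
//   count -= (blk[0] & 1u); // bit 0 has no predecessor - undo its phantom change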
1935
1936/*!
1937 AVX2 calculate number of bit changes from 0 to 1 from a XOR product
1938 @ingroup AVX2
1939*/
1940inline
1941void avx2_bit_block_calc_xor_change(const __m256i* BMRESTRICT block,
1942 const __m256i* BMRESTRICT xor_block,
1943 unsigned size,
1944 unsigned* BMRESTRICT gcount,
1945 unsigned* BMRESTRICT bcount)
1946{
1947 BM_AVX2_POPCNT_PROLOG
1948
1949 const __m256i* BMRESTRICT block_end =
1950 (const __m256i*)((bm::word_t*)(block) + size);
1951
1952 __m256i m1COshft, m2COshft;
1953 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1954
1955 __m256i cntAcc = _mm256_setzero_si256();
1956 __m256i cntAcc2 = _mm256_setzero_si256();
1957
1958 unsigned w0 = *((bm::word_t*)(block));
1959 unsigned bit_count = 0;
1960 unsigned gap_count = 1;
1961
1962 BM_ALIGN32 bm::id64_t cnt_v[4] BM_ALIGN32ATTR;
1963
1964 unsigned co2, co1 = 0;
1965 for (;block < block_end; block+=2, xor_block+=2)
1966 {
1967 __m256i m1A = _mm256_load_si256(block);
1968 __m256i m2A = _mm256_load_si256(block+1);
1969 __m256i m1B = _mm256_load_si256(xor_block);
1970 __m256i m2B = _mm256_load_si256(xor_block+1);
1971
1972 m1A = _mm256_xor_si256 (m1A, m1B);
1973 m2A = _mm256_xor_si256 (m2A, m2B);
1974
1975 {
1976 BM_AVX2_BIT_COUNT(bc, m1A)
1977 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
1978 BM_AVX2_BIT_COUNT(bc, m2A)
1979 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
1980 }
1981
1982 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1983 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1984
1985 co2 = _mm256_extract_epi32(m1CO, 7);
1986
1987 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1988 __m256i m2As = _mm256_slli_epi32(m2A, 1);
1989
1990 // shift CO flags using +1 permute indexes, add CO to v[0]
1991 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1992 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
1993
1994 co1 = co2;
1995
1996 co2 = _mm256_extract_epi32(m2CO, 7);
1997 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1998 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1999
2000 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2001 m2As = _mm256_or_si256(m2As, m2COshft);
2002
2003 co1 = co2;
2004
2005 // we now have two shifted AVX2 regs with carry-over
2006 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2007 m2A = _mm256_xor_si256(m2A, m2As);
2008
2009 {
2010 BM_AVX2_BIT_COUNT(bc, m1A)
2011 cntAcc = _mm256_add_epi64(cntAcc, bc);
2012 BM_AVX2_BIT_COUNT(bc, m2A)
2013 cntAcc = _mm256_add_epi64(cntAcc, bc);
2014 }
2015 } // for
2016
2017 // horizontal count sum
2018 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2019 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2020 gap_count -= (w0 & 1u); // correct initial carry-in error
2021 if (!gap_count)
2022 ++gap_count; // always >0
2023
2024 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc2);
2025 bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2026
2027 *gcount = gap_count;
2028 *bcount = bit_count;
2029}
2030
2031
2032
2033/*!
2034 AVX2 calculate number of bit changes from 0 to 1 and bitcount
2035 @ingroup AVX2
2036*/
2037inline
2038void avx2_bit_block_calc_change_bc(const __m256i* BMRESTRICT block,
2039                                   unsigned* gcount, unsigned* bcount)
2040{
2041    bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2042
2043 const __m256i* block_end =
2044 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2045
2046 __m256i m1COshft, m2COshft;
2047 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2048 __m256i cntAcc = _mm256_setzero_si256();
2049
2050 unsigned w0 = *((bm::word_t*)(block));
2051 unsigned bit_count = 0;
2052 unsigned gap_count = 1;
2053
2054    BM_AVX2_POPCNT_PROLOG
2055
2056 unsigned co2, co1 = 0;
2057 for (;block < block_end; block+=2)
2058 {
2059 __m256i m1A = _mm256_load_si256(block);
2060 __m256i m2A = _mm256_load_si256(block+1);
2061
2062 // popcount
2063 {
2064 bm::id64_t* b64 = (bm::id64_t*)block;
2065
2066 bit_count += (unsigned) (_mm_popcnt_u64(b64[0]) + _mm_popcnt_u64(b64[1]));
2067 bit_count += (unsigned)(_mm_popcnt_u64(b64[2]) + _mm_popcnt_u64(b64[3]));
2068
2069 bit_count += (unsigned)(_mm_popcnt_u64(b64[4]) + _mm_popcnt_u64(b64[5]));
2070 bit_count += (unsigned)(_mm_popcnt_u64(b64[6]) + _mm_popcnt_u64(b64[7]));
2071 }
2072
2073 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2074 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2075
2076 co2 = _mm256_extract_epi32(m1CO, 7);
2077
2078 __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2079 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2080
2081 // shift CO flags using +1 permute indexes, add CO to v[0]
2082 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2083 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2084
2085 co1 = co2;
2086
2087 co2 = _mm256_extract_epi32(m2CO, 7);
2088 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2089 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2090
2091 m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2092 m2As = _mm256_or_si256(m2As, m2COshft);
2093
2094 co1 = co2;
2095
2096 // we now have two shifted AVX2 regs with carry-over
2097 m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2098 m2A = _mm256_xor_si256(m2A, m2As);
2099
2100 {
2101 BM_AVX2_BIT_COUNT(bc, m1A)
2102 cntAcc = _mm256_add_epi64(cntAcc, bc);
2103 BM_AVX2_BIT_COUNT(bc, m2A)
2104 cntAcc = _mm256_add_epi64(cntAcc, bc);
2105 }
2106 } // for
2107
2108 // horizontal count sum
2109 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2110 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2111 gap_count -= (w0 & 1u); // correct initial carry-in error
2112
2113 *gcount = gap_count;
2114 *bcount = bit_count;
2115}
2116
2117
2118/*!
2119 \brief Find first bit which is different between two bit-blocks
2120 @ingroup AVX2
2121*/
2122inline
2123bool avx2_bit_find_first_diff(const __m256i* BMRESTRICT block1,
2124 const __m256i* BMRESTRICT block2,
2125 unsigned* pos)
2126{
2127 unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2128
2129 const __m256i* block1_end =
2130 (const __m256i*)((bm::word_t*)(block1) + bm::set_block_size);
2131 __m256i maskZ = _mm256_setzero_si256();
2132 __m256i mA, mB;
2133 unsigned simd_lane = 0;
2134 do
2135 {
2136 mA = _mm256_xor_si256(_mm256_load_si256(block1),
2137 _mm256_load_si256(block2));
2138 mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
2139 _mm256_load_si256(block2+1));
2140 __m256i mOR = _mm256_or_si256(mA, mB);
2141 if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2142 {
2143 if (!_mm256_testz_si256(mA, mA))
2144 {
2145            // invert to find (w != 0)
2146            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2147            BM_ASSERT(mask);
2148            int bsf = bm::bsf_asm32(mask); // find first !=0 (could use tzcnt())
2149 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2150 unsigned widx = bsf >> 2; // (bsf / 4);
2151 unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mA, widx);
2152 bsf = bm::bsf_asm32(w); // find first bit != 0
2153 *pos = (simd_lane * 256) + (widx * 32) + bsf;
2154 return true;
2155 }
2156            // invert to find (w != 0)
2157            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2158            BM_ASSERT(mask);
2159            int bsf = bm::bsf_asm32(mask); // find first !=0 (could use tzcnt())
2160 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2161 unsigned widx = bsf >> 2; // (bsf / 4);
2162 unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mB, widx);
2163 bsf = bm::bsf_asm32(w); // find first bit != 0
2164 *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2165 return true;
2166 }
2167
2168 simd_lane+=2;
2169 block1+=2; block2+=2;
2170
2171 } while (block1 < block1_end);
2172 return false;
2173}
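// Editorial note (illustrative, not part of the header): the position decode
// above combines three indexes. For example, a difference found in SIMD lane 3,
// 32-bit word 5 of that lane, bit 7 yields pos = 3*256 + 5*32 + 7 = 935.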
2174
2175
2176/*!
2177 \brief Find first bit set
2178 @ingroup AVX2
2179*/
2180inline
2181bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned* pos)
2182{
2183 unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2184
2185 const __m256i* block_end =
2186 (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2187 __m256i maskZ = _mm256_setzero_si256();
2188 __m256i mA, mB;
2189 unsigned simd_lane = 0;
2190 do
2191 {
2192 mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
2193 __m256i mOR = _mm256_or_si256(mA, mB);
2194 if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2195 {
2196 if (!_mm256_testz_si256(mA, mA))
2197 {
2198            // invert to find (w != 0)
2199            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2200            BM_ASSERT(mask);
2201            int bsf = bm::bsf_asm32(mask); // find first !=0 (could use tzcnt())
2202 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2203 unsigned widx = bsf >> 2; // (bsf / 4);
2204 unsigned w = simd_buf[widx];
2205 bsf = bm::bsf_asm32(w); // find first bit != 0
2206 *pos = (simd_lane * 256) + (widx * 32) + bsf;
2207 return true;
2208 }
2209            // invert to find (w != 0)
2210            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2211            BM_ASSERT(mask);
2212            int bsf = bm::bsf_asm32(mask); // find first !=0 (could use tzcnt())
2213 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2214 unsigned widx = bsf >> 2; // (bsf / 4);
2215 unsigned w = simd_buf[widx];
2216 bsf = bm::bsf_asm32(w); // find first bit != 0
2217 *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2218 return true;
2219 }
2220
2221 simd_lane+=2;
2222 block+=2;
2223
2224 } while (block < block_end);
2225 return false;
2226}
2227
2228
2229
2230/* @brief Gap block population count (array sum) utility
2231 @param pbuf - unrolled, aligned to 1-start GAP buffer
2232 @param avx_vect_waves - number of AVX vector lines to process
2233    @param sum - result accumulator
2234 @return tail pointer
2235
2236 @internal
2237*/
2238inline
2239const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* pbuf,
2240                                       unsigned avx_vect_waves,
2241 unsigned* sum)
2242{
2243 __m256i xcnt = _mm256_setzero_si256();
2244
2245    // accumulate odd and even elements of the vector; the result is
2246    // correct modulo 2^16 (max element value in GAP blocks is 65535),
2247    // so overflow is not an issue here
2248 for (unsigned i = 0; i < avx_vect_waves; ++i)
2249 {
2250 __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
2251 __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
2252 __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
2253 xcnt = _mm256_add_epi16(xcnt, ymm_s2);
2254 pbuf += 32;
2255 }
2256 // odd minus even vector elements clears the result for 1111 blocks
2257 // bsrli - byte shifts the vector element by 2 bytes (1 short int)
2258 xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
2259
2260 // horizontal sum of vector elements
2261 // cnt16[0] + cnt16[2] + cnt16[4] + cnt16[6] + cnt16[8] + cnt16[10] + cnt16[12] + cnt16[14];
2262 //
2263 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
2264 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
2265 __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
2266
2267 // extract 32-bit word and mask to take first 16 bits
2268 *sum += _mm_cvtsi128_si32(xcnt2) & 0xffff;
2269 return pbuf;
2270}
2271
2272
2273/*!
2274 AVX2 index lookup to check what belongs to the same block (8 elements)
2275 \internal
2276*/
2277inline
2278unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size,
2279 unsigned nb, unsigned start)
2280{
2281 const unsigned unroll_factor = 16;
2282 const unsigned len = (size - start);
2283 const unsigned len_unr = len - (len % unroll_factor);
2284 unsigned k;
2285
2286 idx += start;
2287
2288 __m256i nbM = _mm256_set1_epi32(int(nb));
2289
2290 for (k = 0; k < len_unr; k+=unroll_factor)
2291 {
2292 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2293 __m256i nbA = _mm256_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
2294
2295 __m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
2296 if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
2297 break;
2298 __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
2299 __m256i nbB = _mm256_srli_epi32(idxB, bm::set_block_shift);
2300
2301 __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
2302 if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
2303 break;
2304 } // for k
2305 for (; k < len; ++k)
2306 {
2307 if (nb != unsigned(idx[k] >> bm::set_block_shift))
2308 break;
2309 } // for k
2310 return start + k;
2311}
2312
2313
2314/*!
2315    AVX2 bulk bit set
2316 \internal
2317*/
2318inline
2319void avx2_set_block_bits(bm::word_t* BMRESTRICT block,
2320                         const unsigned* BMRESTRICT idx,
2321 unsigned start, unsigned stop )
2322{
2323 const unsigned unroll_factor = 8;
2324 const unsigned len = (stop - start);
2325 const unsigned len_unr = len - (len % unroll_factor);
2326
2327 idx += start;
2328
2329 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2330 __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
2331 __m256i mask1 = _mm256_set1_epi32(1);
2332 __m256i mask_tmp;
2333
2334 unsigned BM_ALIGN32 mask_v[8] BM_ALIGN32ATTR;
2335 unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
2336
2337 unsigned k = 0, mask, w_idx;
2338 for (; k < len_unr; k+=unroll_factor)
2339 {
2340 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2341 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2342 __m256i nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
2343
2344 nbitA = _mm256_and_si256 (nbitA, sw_mask); // nbit &= bm::set_word_mask;
2345
2346 __m256i maskA = _mm256_sllv_epi32(mask1, nbitA); // (1 << nbit)
2347
2348 _mm256_store_si256 ((__m256i*)mword_v, nwordA); // store block word idxs
2349
2350        // shuffle + permute to prepare comparison vector
2351 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
2352 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2353 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2354        if (mask == ~0u) // all idxs belong to the same word
2355 {
2356 w_idx = mword_v[0];
2357 mask_tmp = _mm256_xor_si256 (mask_tmp, mask_tmp); // zero bits
2358 mask_tmp = _mm256_or_si256 (mask_tmp, maskA); // set bits
2359
2360 // horizontal OR via permutation of two 128-bit lanes
2361            // then byte-shifts + OR within the lower 128
2362 __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
2363 __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
2364 mask_tmp = _mm256_or_si256 (mtmp0, mtmp1);
2365 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4); // shift R by 1 int
2366 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2367 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2368 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2369
2370 int u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR
2371 block[w_idx] |= u0;
2372 }
2373 else // whole 256-bit lane does NOT hit the same word...
2374 {
2375 _mm256_store_si256 ((__m256i*)mask_v, maskA);
2376
2377            // compute horizontal OR of set bit mask over lo-hi 128-bit lanes
2378 // it is used later if lo or hi lanes hit the same word
2379 // (probabilistic speculation)
2380 //
2381 int u0, u4;
2382 {
2383 mask_tmp = _mm256_bsrli_epi128(maskA, 4); // shift R by 1 int
2384 mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2385 __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2386 mask_tmp = _mm256_or_si256 (m0, mask_tmp);
2387
2388 u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR (128-lo)
2389 u4 = _mm256_extract_epi32(mask_tmp, 4); // final OR (128-hi)
2390 }
2391
2392 // check the lo 128-lane
2393 {
2394 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 0); // lo
2395 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0); // copy simd[0]
2396 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2397                if (mask == ~0u) // all idxs belong to the same word
2398 {
2399 w_idx = mword_v[0];
2400 block[w_idx] |= u0;
2401 }
2402 else // different block words: use "shotgun" OR
2403 {
2404 block[mword_v[0]] |= mask_v[0];
2405 block[mword_v[1]] |= mask_v[1];
2406 block[mword_v[2]] |= mask_v[2];
2407 block[mword_v[3]] |= mask_v[3];
2408
2409 }
2410 }
2411
2412 // check the hi 128-lane
2413 {
2414 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 1); // hi
2415 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2416 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2417                if (mask == ~0u) // all idxs belong to the same word
2418 {
2419 w_idx = mword_v[4];
2420 block[w_idx] |= u4;
2421 }
2422 else
2423 {
2424 block[mword_v[4]] |= mask_v[4];
2425 block[mword_v[5]] |= mask_v[5];
2426 block[mword_v[6]] |= mask_v[6];
2427 block[mword_v[7]] |= mask_v[7];
2428 }
2429 }
2430 }
2431 } // for k
2432
2433 for (; k < len; ++k)
2434 {
2435 unsigned n = idx[k];
2436 unsigned nbit = unsigned(n & bm::set_block_mask);
2437 unsigned nword = nbit >> bm::set_word_shift;
2438 nbit &= bm::set_word_mask;
2439 block[nword] |= (1u << nbit);
2440 } // for k
2441}
2442
2443
2444/** Set bits in an AVX target, by indexes (int4) from the source
2445 @internal
2446*/
2447BMFORCEINLINE
2448__m256i avx2_setbit_256(__m256i target, __m256i source)
2449{
2450 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2451 __m256i mask1 = _mm256_set1_epi32(1);
2452
2453 __m256i v0, v1, acc1, acc2;
2454 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0));
2455 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
2456 v0 = _mm256_sub_epi32(v0, stride_idx);
2457 v1 = _mm256_sub_epi32(v1, stride_idx);
2458 v0 = _mm256_sllv_epi32(mask1, v0);
2459 v1 = _mm256_sllv_epi32(mask1, v1);
2460 acc1 = _mm256_or_si256(v1, v0);
2461 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
2462 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
2463 v0 = _mm256_sub_epi32(v0, stride_idx);
2464 v1 = _mm256_sub_epi32(v1, stride_idx);
2465 v0 = _mm256_sllv_epi32(mask1, v0);
2466 v1 = _mm256_sllv_epi32(mask1, v1);
2467 acc2 = _mm256_or_si256(v1, v0);
2468 target = _mm256_or_si256(target, acc1);
2469 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
2470 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
2471 v0 = _mm256_sub_epi32(v0, stride_idx);
2472 v1 = _mm256_sub_epi32(v1, stride_idx);
2473 v0 = _mm256_sllv_epi32(mask1, v0);
2474 v1 = _mm256_sllv_epi32(mask1, v1);
2475 acc1 = _mm256_or_si256(v1, v0);
2476 target = _mm256_or_si256(target, acc2);
2477 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
2478 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
2479 v0 = _mm256_sub_epi32(v0, stride_idx);
2480 v1 = _mm256_sub_epi32(v1, stride_idx);
2481 v0 = _mm256_sllv_epi32(mask1, v0);
2482 v1 = _mm256_sllv_epi32(mask1, v1);
2483 acc2 = _mm256_or_si256(v1, v0);
2484
2485 target = _mm256_or_si256(target, acc1);
2486 target = _mm256_or_si256(target, acc2);
2487 return target;
2488}
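// Editorial sketch (not part of the header): scalar equivalent of the variable
// shift trick above. _mm256_sllv_epi32 produces 0 for shift counts outside
// [0..31], so after subtracting the per-lane stride base only the lane that
// owns the bit index keeps a non-zero mask. The function name is illustrative.
inline void scalar_setbit_256(unsigned target[8], const unsigned idx[8])
{
    for (unsigned k = 0; k < 8; ++k)                    // idx[k] expected in [0..255]
        target[idx[k] >> 5] |= (1u << (idx[k] & 31u));  // word = idx/32, bit = idx%32
}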
2489
2490
2491/** Experimental code to set bits via AVX strides
2492 @internal
2493*/
2494inline
2495void avx2_set_block_bits2(bm::word_t* BMRESTRICT block,
2496                          const unsigned* BMRESTRICT idx,
2497 unsigned start, unsigned stop )
2498{
2499 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2500 __m256i mask1 = _mm256_set1_epi32(1);
2501 __m256i* block_avx = (__m256i*)block;
2502
2503 unsigned stride = 0;
2504 __m256i* avx_stride_p = block_avx + stride;
2505 __m256i blkA = _mm256_load_si256(avx_stride_p);
2506
2507 for (unsigned i = start; i < stop; ++i)
2508 {
2509 unsigned n = idx[i];
2510 unsigned nbit = unsigned(n & bm::set_block_mask);
2511 unsigned new_stride = nbit >> 8; // (nbit / 256)
2512 unsigned stride_bit = nbit & 0xFF; // (nbit % 256)
2513 if (new_stride != stride)
2514 {
2515 _mm256_store_si256(avx_stride_p, blkA); // flush the avx2 accum
2516 stride = new_stride;
2517 avx_stride_p = block_avx + stride;
2518 blkA = _mm256_load_si256(avx_stride_p); // re-load the accum
2519 }
2520 // set avx2 stride bit
2521 __m256i v0 = _mm256_set1_epi32(stride_bit);
2522 __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
2523 __m256i k0 = _mm256_sllv_epi32(mask1, s0);
2524 blkA = _mm256_or_si256(blkA, k0);
2525 } // for i
2526
2527 _mm256_store_si256(avx_stride_p, blkA);
2528}
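// Editorial note (illustrative): the stride arithmetic above, worked out --
// nbit = 1000 gives new_stride = 1000 >> 8 = 3 and stride_bit = 1000 & 0xFF = 232,
// i.e. bit 232 inside the fourth 256-bit AVX lane of the block.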
2529
2530/** Experimental code to set bits via AVX strides
2531 @internal
2532*/
2533inline
2534void avx2_set_block_bits3(bm::word_t* BMRESTRICT block,
2535                          const unsigned* BMRESTRICT idx,
2536 unsigned start, unsigned stop )
2537{
2538 const unsigned unroll_factor = 8;
2539 const unsigned len = (stop - start);
2540 const unsigned len_unr = len - (len % unroll_factor);
2541
2542 idx += start;
2543
2544 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2545 __m256i mask1 = _mm256_set1_epi32(1);
2546
2547 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2548 __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);
2549
2550 unsigned BM_ALIGN32 mstride_v[8] BM_ALIGN32ATTR;
2551 int BM_ALIGN32 mstride_bit_v[8] BM_ALIGN32ATTR;
2552
2553 // define the very first block stride based on index 0
2554 unsigned stride = unsigned(idx[0] & bm::set_block_mask) >> 8;
2555
2556 __m256i* block_avx = (__m256i*)block;
2557 __m256i* avx_stride_p = block_avx + stride;
2558
2559 __m256i blkA = _mm256_load_si256(avx_stride_p); // load the first accum
2560
2561 unsigned k = 0, mask;
2562 for (; k < len_unr; k+=unroll_factor)
2563 {
2564 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2565 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2566 __m256i strideA = _mm256_srli_epi32 (nbitA, 8); // new_stride = nbit >> 8
2567 __m256i strideBitA = _mm256_and_si256 (nbitA, stride_bit_mask); // stride_bit = nbit & 0xFF;
2568
2569 // construct a cmp vector from broadcasted v[0]
2570 __m256i mask_tmp = _mm256_shuffle_epi32 (strideA, 0x0);
2571 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2572 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
2573        if (mask == ~0u) // all idxs belong to the same avx2 stride
2574 {
2575 unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
2576 if (new_stride != stride)
2577 {
2578 _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2579 stride = new_stride;
2580 avx_stride_p = block_avx + stride;
2581 blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2582 }
2583 // set 8 bits all at once
2584 blkA = bm::avx2_setbit_256(blkA, strideBitA);
2585 }
2586 else // stride mix here, process one by one
2587 {
2588 _mm256_store_si256 ((__m256i*)mstride_bit_v, strideBitA); // store block stride-bit idxs
2589 _mm256_store_si256 ((__m256i*)mstride_v, strideA);
2590 for (unsigned j = 0; j < 8; ++j)
2591 {
2592 unsigned new_stride = mstride_v[j];
2593 if (new_stride != stride)
2594 {
2595 _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2596 stride = new_stride;
2597 avx_stride_p = block_avx + stride;
2598 blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2599 }
2600 // set avx2 bits one by one
2601 mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
2602 mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
2603 mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
2604 blkA = _mm256_or_si256(blkA, mask_tmp);
2605 } // for j
2606 }
2607 } // for k
2608 _mm256_store_si256(avx_stride_p, blkA);
2609
2610 // set the tail bits conventionally
2611 for (; k < len; ++k)
2612 {
2613 unsigned n = idx[k];
2614 unsigned nbit = unsigned(n & bm::set_block_mask);
2615 unsigned nword = nbit >> bm::set_word_shift;
2616 nbit &= bm::set_word_mask;
2617 block[nword] |= (1u << nbit);
2618 } // for k
2619}
2620
2621
2622/**
2623    Experimental. Set bits from 0 to i in an AVX register
2624 [ 000000 00000 0000000 00011 11111 ] - i = 7
2625*/
2626inline
2627__m256i avx2_setbit_to256(unsigned i)
2628{
2629 __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2630 __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
2631 __m256i maskFF = _mm256_set1_epi32(-1);
2632 __m256i maskZ = _mm256_setzero_si256();
2633
2634 __m256i v0 = _mm256_set1_epi32(i);
2635 __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
2636 __m256i k1 = _mm256_sllv_epi32(maskFF, s0);
2637
2638 {
2639 __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
2640 cmp_eq = _mm256_xor_si256(maskFF, cmp_eq); // invert: != 0 mask
2641 k1 = _mm256_xor_si256(k1, cmp_eq); // [ 0 0 0 0 0 0 3 0 ]
2642 }
2643
2644 __m256i cmp_gt = _mm256_cmpgt_epi32 (stride_idx2, v0);
2645 cmp_gt = _mm256_xor_si256(maskFF, cmp_gt); // invert as GT == LT|EQ (LE)
2646 __m256i r = _mm256_xor_si256(k1, cmp_gt); // invert all full words (right)
2647
2648 return r;
2649}
2650
2651
2652
2653/**
2654 Experimental (test) function to do SIMD vector search (lower bound)
2655 in sorted, growing array
2656 @ingroup AVX2
2657
2658 \internal
2659*/
2660inline
2661int avx2_cmpge_u32(__m256i vect8, unsigned value)
2662{
2663 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
2664 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
2665 //
2666 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2667 __m256i mm_val = _mm256_set1_epi32(value);
2668
2669 __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8); // (signed) vect4 - 0x80000000
2670 __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
2671
2672 __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
2673 __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);
2674
2675 __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
2676 int mask = _mm256_movemask_epi8(cmp_mask_ge);
2677 if (mask)
2678 {
2679        int bsf = bm::bsf_asm32(mask); // could use tzcnt()
2680 return bsf / 4;
2681 }
2682 return -1;
2683}
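// Editorial note (illustrative, not part of the header): the 0x80000000 bias
// maps unsigned order onto signed order. For a = 0x90000000 and b = 0x10000000
// (unsigned a > b): (int)(a - 0x80000000) = 0x10000000 while
// (int)(b - 0x80000000) = 0x90000000, which is negative, so the signed
// compare still reports a > b.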
2684
2685/**
2686 Experimental (test) function to do SIMD vector search
2687 in sorted, growing array
2688 @ingroup AVX2
2689
2690 \internal
2691*/
2692inline
2693int avx2_cmpge_u16(__m256i vect16, unsigned short value)
2694{
2695 __m256i mZ = _mm256_setzero_si256();
2696 __m256i mVal = _mm256_set1_epi16(value);
2697
2698    // subs_epu16 - unsigned subtraction with saturation, gives 0u if (a - b) < 0
2699 __m256i mSub = _mm256_subs_epu16(mVal, vect16);
2700 __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2701 unsigned mask = _mm256_movemask_epi8(mge_mask);
2702 if (mask)
2703 {
2704 int lz = _tzcnt_u32(mask);
2705 return lz / 2;
2706 }
2707 return -1;
2708}
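// Editorial note (illustrative): per 16-bit lane the saturating subtraction
// above gives (value -sat x) == 0 exactly when x >= value. For value = 100:
// x = 250 -> 0 (x >= value), x = 60 -> 40 (x < value); the first zero lane
// therefore marks the lower-bound position.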
2709
2710/**
2711 Hybrid binary search, starts as binary, then switches to scan
2712
2713    NOTE: AVX code uses _mm256_subs_epu16 - saturated subtraction,
2714    which gives 0 if A < B (rather than a negative value).
2715
2716 \param buf - GAP buffer pointer.
2717 \param pos - index of the element.
2718 \param is_set - output. GAP value (0 or 1).
2719 \return GAP index.
2720
2721 @ingroup AVX2
2722*/
2723inline
2724unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
2725 unsigned pos, unsigned* BMRESTRICT is_set)
2726{
2727 BM_ASSERT(is_set);
2728
2729 const unsigned linear_cutoff = 48;
2730 const unsigned unroll_factor = 16;
2731
2732    BM_ASSERT(pos < bm::gap_max_bits);
2733
2734 unsigned res;
2735 unsigned start = 1;
2736 unsigned end = 1 + ((*buf) >> 3);
2737 unsigned arr_end = end;
2738
2739 if (end - start < unroll_factor) // too small for a full AVX stride
2740 {
2741 for (; start < end; ++start)
2742 {
2743 if (buf[start] >= pos)
2744 {
2745 res = ((*buf) & 1) ^ ((start-1) & 1);
2746 *is_set = res;
2747 return start;
2748 }
2749 } // for
2750 BM_ASSERT(0);
2751 }
2752
2753 while (start != end)
2754 {
2755 unsigned dsize = end - start;
2756 if (dsize < linear_cutoff)
2757 {
2758 // set wider scan window to possibly over-read the range,
2759 // but stay within allocated block memory
2760 //
2761 dsize = arr_end - start;
2762
2763 __m256i mZ = _mm256_setzero_si256();
2764 __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
2765 __m256i vect16, mSub, mge_mask;
2766
2767 unsigned len_unr = start + (dsize - (dsize % unroll_factor));
2768 for (; start < len_unr; start += unroll_factor)
2769 {
2770 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); // 16x u16s
2771 mSub = _mm256_subs_epu16(mPos, vect16);
2772 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2773 int mask = _mm256_movemask_epi8(mge_mask);
2774 if (mask)
2775 {
2776 int lz = _tzcnt_u32(mask) / 2;
2777 start += lz;
2778 res = ((*buf) & 1) ^ ((start-1) & 1);
2779 *is_set = res;
2780 return start;
2781 }
2782 } // for k
2783 unsigned tail = unroll_factor - (end - start);
2784 if (start > tail+1)
2785 {
2786 start -= tail; // rewind back, but stay within block
2787 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); // 16x u16s
2788 mSub = _mm256_subs_epu16(mPos, vect16);
2789 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2790 int mask = _mm256_movemask_epi8(mge_mask);
2791                BM_ASSERT(mask); // the result MUST be here at this point
2792
2793 int lz = _tzcnt_u32(mask) / 2;
2794 start += lz;
2795 res = ((*buf) & 1) ^ ((start-1) & 1);
2796 *is_set = res;
2797 return start;
2798 }
2799 for (; start < end; ++start)
2800 {
2801 if (buf[start] >= pos)
2802 break;
2803 } // for
2804 break;
2805 }
2806 unsigned curr = (start + end) >> 1;
2807 if (buf[curr] < pos)
2808 start = curr + 1;
2809 else
2810 end = curr;
2811 } // while
2812 res = ((*buf) & 1) ^ ((start-1) & 1);
2813 *is_set = res;
2814 return start;
2815}
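// Editorial usage sketch (not part of the header), assuming the GAP layout
// implied by this listing (see avx2_bit_to_gap() below): buf[0] holds
// (length << 3) | start_bit_value, followed by ascending run-end positions
// terminated by 65535. The example block has runs [0..99]=0, [100..199]=1,
// [200..65535]=0.
inline void avx2_gap_bfind_usage_example()
{
    const unsigned short gap_blk[4] = { (3u << 3) | 0u, 99, 199, 65535 };
    unsigned is_set;
    unsigned gap_idx = bm::avx2_gap_bfind(gap_blk, 150, &is_set);
    // expected: gap_idx == 2 and is_set == 1 (bit 150 falls into the set run)
    (void)gap_idx; (void)is_set;
}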
2816
2817
2818/**
2819 Hybrid binary search, starts as binary, then switches to scan
2820 @ingroup AVX2
2821*/
2822inline
2823unsigned avx2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
2824{
2825 unsigned is_set;
2826 bm::avx2_gap_bfind(buf, pos, &is_set);
2827 return is_set;
2828}
2829
2830/**
2831    lower bound (greater or equal) linear scan in an ascending-order sorted array
2832 @ingroup AVX2
2833 \internal
2834*/
2835inline
2836unsigned avx2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
2837 unsigned target,
2838 unsigned from,
2839 unsigned to)
2840{
2841 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
2842 // see more at:
2843 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
2844
2845 const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
2846
2847 unsigned unroll_factor = 8;
2848 unsigned len = to - from + 1;
2849 unsigned len_unr = len - (len % unroll_factor);
2850
2851 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2852 __m256i vect_target = _mm256_set1_epi32(target);
2853 __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
2854
2855 int mask;
2856 __m256i vect80, norm_vect80, cmp_mask_ge;
2857
2858 unsigned k = 0;
2859 for (; k < len_unr; k += unroll_factor)
2860 {
2861 vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k])); // 8 u32s
2862 norm_vect80 = _mm256_sub_epi32(vect80, mask0x8); // (signed) vect4 - 0x80000000
2863
2864 cmp_mask_ge = _mm256_or_si256( // GT | EQ
2865 _mm256_cmpgt_epi32(norm_vect80, norm_target),
2866 _mm256_cmpeq_epi32(vect80, vect_target)
2867 );
2868 mask = _mm256_movemask_epi8(cmp_mask_ge);
2869 if (mask)
2870 {
2871 int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
2872 return from + k + (bsf / 4);
2873 }
2874 } // for
2875
2876 for (; k < len; ++k)
2877 {
2878 if (arr_base[k] >= target)
2879 return from + k;
2880 }
2881 return to + 1;
2882}
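// Editorial usage sketch (not part of the header): the scan returns the index
// of the first element >= target within [from..to], or to+1 when none exists.
inline void avx2_lower_bound_usage_example()
{
    const unsigned arr[10] = { 1, 5, 9, 20, 40, 80, 160, 320, 640, 1000 };
    unsigned idx = bm::avx2_lower_bound_scan_u32(arr, 40, 0, 9);
    // expected: idx == 4 (arr[4] == 40 is the first element >= 40)
    (void)idx;
}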
2883
2884
2885/*!
2886 AVX2 bit block gather-scatter
2887
2888 @param arr - destination array to set bits
2889 @param blk - source bit-block
2890 @param idx - gather index array
2891 @param size - gather array size
2892 @param start - gaher start index
2893 @param bit_idx - bit to set in the target array
2894
2895 \internal
2896
2897 C algorithm:
2898
2899 for (unsigned k = start; k < size; ++k)
2900 {
2901 nbit = unsigned(idx[k] & bm::set_block_mask);
2902 nword = unsigned(nbit >> bm::set_word_shift);
2903 mask0 = 1u << (nbit & bm::set_word_mask);
2904 arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
2905 }
2906
2907*/
2908inline
2909void avx2_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
2910                                   const unsigned* BMRESTRICT blk,
2911 const unsigned* BMRESTRICT idx,
2912 unsigned size,
2913 unsigned start,
2914 unsigned bit_idx)
2915{
2916 const unsigned unroll_factor = 8;
2917 const unsigned len = (size - start);
2918 const unsigned len_unr = len - (len % unroll_factor);
2919
2920 __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2921 __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
2922 __m256i maskFF = _mm256_set1_epi32(~0u);
2923
2924 __m256i mask_tmp, mask_0;
2925
2926 unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
2927
2928 unsigned k = 0, mask, w_idx;
2929 for (; k < len_unr; k+=unroll_factor)
2930 {
2931 __m256i nbitA, nwordA;
2932 const unsigned base = start + k;
2933 __m256i* idx_ptr = (__m256i*)(idx+base); // idx[base]
2934
2935 nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
2936 nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
2937
2938        // shuffle + permute to prepare comparison vector
2939 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
2940 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2941 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2942 _mm256_store_si256((__m256i*)mword_v, nwordA);
2943
2944        if (mask == ~0u) // all idxs belong to the same word, avoid (costly) gather
2945 {
2946 w_idx = mword_v[0];
2947 mask_tmp = _mm256_set1_epi32(blk[w_idx]); // use broadcast
2948 }
2949 else // gather for: blk[nword] (.. & mask0 )
2950 {
2951 mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
2952 blk[mword_v[5]], blk[mword_v[4]],
2953 blk[mword_v[3]], blk[mword_v[2]],
2954 blk[mword_v[1]], blk[mword_v[0]]);
2955 }
2956
2957 // mask0 = 1u << (nbit & bm::set_word_mask);
2958 //
2959 __m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
2960 __m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
2961 mask_0 = _mm256_sllv_epi32(mask1, shiftA);
2962
2963 mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
2964 if (!_mm256_testz_si256(mask_tmp, mask_tmp)) // AND tests empty
2965 {
2966 __m256i* target_ptr = (__m256i*)(arr+base); // arr[base]
2967 // bool(blk[nword] ... )
2968 __m256i maskZ = _mm256_xor_si256(maskFF, maskFF); // all zero
2969 mask1 = _mm256_slli_epi32(mask1, bit_idx); // << bit_idx
2970 mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF if==0
2971 mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF); // invert
2972 mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
2973 _mm256_storeu_si256 (target_ptr, // arr[base] |= MASK_EXPR
2974 _mm256_or_si256 (mask_tmp,
2975 _mm256_loadu_si256(target_ptr)));
2976 }
2977
2978 } // for
2979
2980 for (; k < len; ++k)
2981 {
2982 const unsigned base = start + k;
2983 unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
2984 arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
2985 }
2986
2987}
2988
2989/**
2990 Convert bit block to GAP block
2991 @ingroup AVX2
2992 \internal
2993*/
2994inline
2995unsigned avx2_bit_to_gap(gap_word_t* BMRESTRICT dest,
2996                         const unsigned* BMRESTRICT block,
2997 unsigned dest_len)
2998{
2999 const unsigned* BMRESTRICT block_end = block + bm::set_block_size;
3000 gap_word_t* BMRESTRICT pcurr = dest;
3001 gap_word_t* BMRESTRICT end = dest + dest_len; (void)end;
3002
3003 unsigned bitval = (*block) & 1u;
3004 *pcurr++ = bm::gap_word_t(bitval);
3005 *pcurr = 0;
3006 unsigned bit_idx = 0;
3007
3008 const unsigned vCAP = 64; // 64-bit system
3009 __m256i maskZ = _mm256_set1_epi32(0);
3010
3011 for (; block < block_end; block += 8)
3012 {
3013 unsigned k = 0;
3014 if (!bitval)
3015 {
3016 // check number of trailing 64-bit words using AVX compare
3017 __m256i accA = _mm256_load_si256((__m256i*)block); // 4x u64s
3018 __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
3019            unsigned mask = ~_mm256_movemask_epi8(cmpA);
3020 if (!mask)
3021 {
3022 bit_idx += 256;
3023 continue;
3024 }
3025 unsigned w64_idx = _tzcnt_u32(mask);
3026 k = w64_idx / 8; // 8 byte word offset
3027 bit_idx += k * vCAP;
3028 }
3029
3030 for (; k < 4; ++k)
3031 {
3032 bm::id64_t val = (((bm::id64_t*)block)[k]);
3033
3034 if (!val || val == ~0ull)
3035 {
3036 // branchless if
3037 bool cmp = (bool(bitval) != bool(val));
3038 unsigned mask = ~(cmp - 1u);
3039 *pcurr = mask & (gap_word_t)(bit_idx-cmp);
3040 bitval ^= unsigned(cmp);
3041 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3042 pcu += mask & sizeof(gap_word_t);
3043 pcurr = reinterpret_cast<gap_word_t*>(pcu);
3044 bit_idx += vCAP;
3045 continue;
3046            } // if
3047
3048
3049 // process "0100011" word
3050 //
3051 unsigned bits_consumed = 0;
3052 do
3053 {
3054 unsigned tz = 1u;
3055 if (bitval != (val & tz))
3056 {
3057 bitval ^= tz;
3058 *pcurr++ = (gap_word_t)(bit_idx-tz);
3059
3060 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3061 BM_ASSERT(pcurr != end);
3062 }
3063 else // match, find the next idx
3064 {
3065 tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);
3066 }
3067
3068 bool cmp = ((bits_consumed+=tz) < vCAP);
3069 bit_idx += tz;
3070 val >>= tz;
3071
3072 if (!val)
3073 {
3074 tz = ~(cmp - 1u); // generate 0xFFFF or 0x0000 mask
3075 *pcurr = tz & (gap_word_t)(bit_idx-cmp);
3076 bitval ^= unsigned(cmp);
3077 bit_idx += tz & (vCAP - bits_consumed);
3078 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3079 pcu += tz & sizeof(gap_word_t);
3080 pcurr = reinterpret_cast<gap_word_t*>(pcu);
3081
3082 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3083 BM_ASSERT(pcurr != end);
3084 break;
3085 }
3086 } while (1);
3087 } // for k
3088
3089 } // for block < end
3090
3091 *pcurr = (gap_word_t)(bit_idx-1);
3092 unsigned len = (unsigned)(pcurr - dest);
3093 *dest = (gap_word_t)((*dest & 7) + (len << 3));
3094 return len;
3095}
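// Editorial note (illustrative): for a source block where only bits 100..199
// are set, the conversion above produces dest = { (3 << 3) | 0, 99, 199, 65535 }
// and returns length 3 -- the same layout consumed by avx2_gap_bfind() earlier.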
3096
3097/**
3098 Build partial XOR product of 2 bit-blocks using digest mask
3099
3100 @param target_block - target := block ^ xor_block
3101 @param block - arg1
3102 @param xor_block - arg2
3103 @param digest - mask for each block wave to XOR (1) or just copy (0)
3104
3105 @ingroup AVX2
3106 @internal
3107*/
3108inline
3109void avx2_bit_block_xor(bm::word_t* target_block,
3110                        const bm::word_t* block, const bm::word_t* xor_block,
3111 bm::id64_t digest)
3112{
3113 for (unsigned i = 0; i < bm::block_waves; ++i)
3114 {
3115 const bm::id64_t mask = (1ull << i);
3116 unsigned off = (i * bm::set_block_digest_wave_size);
3117 const __m256i* sub_block = (__m256i*) (block + off);
3118 __m256i* t_sub_block = (__m256i*)(target_block + off);
3119
3120 if (digest & mask) // XOR filtered sub-block
3121 {
3122 const __m256i* xor_sub_block = (__m256i*) (xor_block + off);
3123 __m256i mA, mB, mC, mD;
3124 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3125 _mm256_load_si256(xor_sub_block));
3126 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3127 _mm256_load_si256(xor_sub_block+1));
3128 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3129 _mm256_load_si256(xor_sub_block+2));
3130 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3131 _mm256_load_si256(xor_sub_block+3));
3132
3133 _mm256_store_si256(t_sub_block, mA);
3134 _mm256_store_si256(t_sub_block+1, mB);
3135 _mm256_store_si256(t_sub_block+2, mC);
3136 _mm256_store_si256(t_sub_block+3, mD);
3137 }
3138 else // just copy source
3139 {
3140 _mm256_store_si256(t_sub_block , _mm256_load_si256(sub_block));
3141 _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
3142 _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
3143 _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
3144 }
3145 } // for i
3146}
3147
3148
3149/**
3150 Build partial XOR product of 2 bit-blocks using digest mask
3151
3152 @param target_block - target ^= xor_block
3153 @param xor_block - arg1
3154 @param digest - mask for each block wave to XOR (1)
3155
3156 @ingroup AVX2
3157 @internal
3158*/
3159inline
3160void avx2_bit_block_xor_2way(bm::word_t* target_block,
3161                             const bm::word_t* xor_block,
3162 bm::id64_t digest) BMNOEXCEPT
3163{
3164 while (digest)
3165 {
3166 bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
3167 unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
3168 unsigned off = wave * bm::set_block_digest_wave_size;
3169
3170 const __m256i* sub_block = (const __m256i*) (xor_block + off);
3171 __m256i* t_sub_block = (__m256i*)(target_block + off);
3172
3173 __m256i mA, mB, mC, mD;
3174 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3175 _mm256_load_si256(t_sub_block));
3176 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3177 _mm256_load_si256(t_sub_block+1));
3178 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3179 _mm256_load_si256(t_sub_block+2));
3180 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3181 _mm256_load_si256(t_sub_block+3));
3182
3183 _mm256_store_si256(t_sub_block, mA);
3184 _mm256_store_si256(t_sub_block+1, mB);
3185 _mm256_store_si256(t_sub_block+2, mC);
3186 _mm256_store_si256(t_sub_block+3, mD);
3187
3188 digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
3189 } // while
3190
3191}
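// Editorial sketch (not part of the header): the digest iteration idiom used
// above, written out in scalar form -- it visits the wave index of every set
// bit in the 64-bit digest. The function name is illustrative only.
inline void visit_digest_waves_example(bm::id64_t digest)
{
    while (digest)
    {
        bm::id64_t t = digest & (0ull - digest);              // isolate lowest set bit (BLSI)
        unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);      // wave index = popcount of lower bits
        unsigned off = wave * bm::set_block_digest_wave_size; // word offset of the 256-bit wave
        (void)off;
        digest &= (digest - 1);                               // clear lowest set bit (BLSR)
    }
}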
3192
3193
3194
3195#ifdef __GNUG__
3196#pragma GCC diagnostic pop
3197#endif
3198
3199
3200#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
3201 avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3202
3203#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
3204 avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3205
3206#define VECT_BITCOUNT(first, last) \
3207 avx2_bit_count((__m256i*) (first), (__m256i*) (last))
3208
3209#define VECT_BITCOUNT_AND(first, last, mask) \
3210 avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3211
3212#define VECT_BITCOUNT_OR(first, last, mask) \
3213 avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3214
3215#define VECT_BITCOUNT_XOR(first, last, mask) \
3216 avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3217
3218#define VECT_BITCOUNT_SUB(first, last, mask) \
3219 avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3220
3221#define VECT_INVERT_BLOCK(first) \
3222 avx2_invert_block((__m256i*)first);
3223
3224#define VECT_AND_BLOCK(dst, src) \
3225 avx2_and_block((__m256i*) dst, (const __m256i*) (src))
3226
3227#define VECT_AND_DIGEST(dst, src) \
3228 avx2_and_digest((__m256i*) dst, (const __m256i*) (src))
3229
3230#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
3231 avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3232
3233#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
3234 avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3235
3236#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3237 avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3238
3239#define VECT_OR_BLOCK(dst, src) \
3240 avx2_or_block((__m256i*) dst, (__m256i*) (src))
3241
3242#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3243 avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3244
3245#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
3246 avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3247
3248#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3249 avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3250
3251#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
3252 avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))
3253
3254#define VECT_SUB_BLOCK(dst, src) \
3255 avx2_sub_block((__m256i*) dst, (__m256i*) (src))
3256
3257#define VECT_SUB_DIGEST(dst, src) \
3258 avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))
3259
3260#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
3261 avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3262
3263#define VECT_XOR_BLOCK(dst, src) \
3264 avx2_xor_block((__m256i*) dst, (__m256i*) (src))
3265
3266#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
3267 avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3268
3269#define VECT_COPY_BLOCK(dst, src) \
3270 avx2_copy_block((__m256i*) dst, (__m256i*) (src))
3271
3272#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
3273 avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))
3274
3275#define VECT_STREAM_BLOCK(dst, src) \
3276 avx2_stream_block((__m256i*) dst, (__m256i*) (src))
3277
3278#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
3279 avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))
3280
3281#define VECT_SET_BLOCK(dst, value) \
3282 avx2_set_block((__m256i*) dst, (value))
3283
3284#define VECT_IS_ZERO_BLOCK(dst) \
3285 avx2_is_all_zero((__m256i*) dst)
3286
3287#define VECT_IS_ONE_BLOCK(dst) \
3288 avx2_is_all_one((__m256i*) dst)
3289
3290#define VECT_IS_DIGEST_ZERO(start) \
3291 avx2_is_digest_zero((__m256i*)start)
3292
3293#define VECT_BLOCK_SET_DIGEST(dst, val) \
3294 avx2_block_set_digest((__m256i*)dst, val)
3295
3296#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
3297 avx2_lower_bound_scan_u32(arr, target, from, to)
3298
3299#define VECT_SHIFT_L1(b, acc, co) \
3300 avx2_shift_l1((__m256i*)b, acc, co)
3301
3302#define VECT_SHIFT_R1(b, acc, co) \
3303 avx2_shift_r1((__m256i*)b, acc, co)
3304
3305#define VECT_SHIFT_R1_AND(b, co, m, digest) \
3306 avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)
3307
3308#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
3309 avx2_idx_arr_block_lookup(idx, size, nb, start)
3310
3311#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
3312 avx2_set_block_bits3(block, idx, start, stop)
3313
3314#define VECT_BLOCK_CHANGE(block, size) \
3315 avx2_bit_block_calc_change((__m256i*)block, size)
3316
3317#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
3318 avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)
3319
3320#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
3321 avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)
3322
3323#define VECT_BIT_TO_GAP(dest, src, dest_len) \
3324 avx2_bit_to_gap(dest, src, dest_len)
3325
3326#define VECT_BIT_FIND_FIRST(src1, pos) \
3327 avx2_bit_find_first((__m256i*) src1, pos)
3328
3329#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
3330 avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)
3331
3332#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
3333 avx2_bit_block_xor(t, src, src_xor, d)
3334
3335#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
3336 avx2_bit_block_xor_2way(t, src_xor, d)
3337
3338#define VECT_GAP_BFIND(buf, pos, is_set) \
3339 avx2_gap_bfind(buf, pos, is_set)
3340
3341#define VECT_BIT_COUNT_DIGEST(blk, d) \
3342 avx2_bit_block_count(blk, d)
3343
3344
3345} // namespace
3346
3347
3348
3349
3350#endif
#define BM_AVX2_POPCNT_PROLOG
Definition: bmavx2.h:140
#define BM_CSA256(h, l, a, b, c)
Definition: bmavx2.h:117
#define BM_AVX2_BIT_COUNT(ret, v)
Definition: bmavx2.h:124
Definitions(internal)
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BM_ALIGN32
Definition: bmdef.h:292
#define BMFORCEINLINE
Definition: bmdef.h:213
#define BM_ASSERT
Definition: bmdef.h:139
#define BM_ALIGN32ATTR
Definition: bmdef.h:293
Bit manipulation primitives (internal)
unsigned avx2_xor_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
XOR block against another dst ^= *src.
Definition: bmavx2.h:1060
bool avx2_sub_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
SUB (AND NOT) block digest stride dst &= ~*src.
Definition: bmavx2.h:1202
bool avx2_and_digest_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
AND block digest stride.
Definition: bmavx2.h:659
bool avx2_or_block_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
OR array elements against another 4 arrays dst |= *src1 | src2.
Definition: bmavx2.h:991
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmavx2.h:3160
void avx2_bit_block_calc_change_bc(const __m256i *BMRESTRICT block, unsigned *gcount, unsigned *bcount)
Definition: bmavx2.h:2038
bm::id_t avx2_bit_count_sub(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND NOT bit count for two aligned bit-blocks.
Definition: bmavx2.h:413
bool avx2_or_block_3way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR array elements against another 2 arrays dst |= *src1 | src2.
Definition: bmavx2.h:939
bm::id_t avx2_bit_count_xor(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
XOR bit count for two aligned bit-blocks.
Definition: bmavx2.h:368
bool avx2_and_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND block digest stride dst &= *src.
Definition: bmavx2.h:543
BMFORCEINLINE bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
Definition: bmavx2.h:1592
void avx2_copy_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy dst = *src.
Definition: bmavx2.h:1290
BMFORCEINLINE bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFFF
Definition: bmavx2.h:1578
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
Definition: bmavx2.h:1690
bool avx2_is_digest_zero(const __m256i *BMRESTRICT block)
check if digest stride is all zero bits
Definition: bmavx2.h:1525
bool avx2_or_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
OR array elements against another unaligned array dst |= *src.
Definition: bmavx2.h:840
void avx2_bit_block_calc_xor_change(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gcount, unsigned *BMRESTRICT bcount)
Definition: bmavx2.h:1941
bool avx2_and_or_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
Definition: bmavx2.h:604
bool avx2_bit_find_first_diff(const __m256i *BMRESTRICT block1, const __m256i *BMRESTRICT block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
Definition: bmavx2.h:2123
unsigned avx2_bit_to_gap(gap_word_t *BMRESTRICT dest, const unsigned *BMRESTRICT block, unsigned dest_len)
Convert bit block to GAP block.
Definition: bmavx2.h:2995
BMFORCEINLINE void avx2_set_block(__m256i *BMRESTRICT dst, bm::word_t value)
AVX2 block memset dst = value.
Definition: bmavx2.h:1264
bool avx2_shift_r1_and(__m256i *BMRESTRICT block, bm::word_t co1, const __m256i *BMRESTRICT mask_block, bm::id64_t *BMRESTRICT digest)
fused block shift right by 1 plus AND
Definition: bmavx2.h:1746
void avx2_copy_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy (unaligned SRC) dst = *src.
Definition: bmavx2.h:1332
bool avx2_or_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
OR array elements against another array dst |= *src.
Definition: bmavx2.h:787
void avx2_stream_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy dst = *src.
Definition: bmavx2.h:1376
unsigned avx2_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
Definition: bmavx2.h:2823
bm::id_t avx2_bit_count_and(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND bit count for two aligned bit-blocks.
Definition: bmavx2.h:290
void avx2_andnot_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
Inverts array elements and NOT them to specified mask dst = ~*src & mask.
Definition: bmavx2.h:472
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
Definition: bmavx2.h:1539
unsigned avx2_sub_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
Definition: bmavx2.h:1156
bool avx2_is_all_zero(const __m256i *BMRESTRICT block)
check if block is all zero bits
Definition: bmavx2.h:1495
BMFORCEINLINE bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 wave of pointers are all NULL
Definition: bmavx2.h:1603
bool avx2_and_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND block digest stride 2 way dst = *src1 & *src2.
Definition: bmavx2.h:573
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
Definition: bmavx2.h:232
void avx2_stream_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy (unaligned SRC) dst = *src.
Definition: bmavx2.h:1418
BMFORCEINLINE bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 wave of pointers are all the same (NULL or FULL)
Definition: bmavx2.h:1616
unsigned avx2_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to scan.
Definition: bmavx2.h:2724
unsigned avx2_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
Definition: bmavx2.h:2836
bool avx2_bit_find_first(const __m256i *BMRESTRICT block, unsigned *pos)
Find first bit set.
Definition: bmavx2.h:2181
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmavx2.h:3109
bool avx2_is_all_one(const __m256i *BMRESTRICT block)
check if block is all one bits
Definition: bmavx2.h:1554
unsigned avx2_bit_block_calc_change(const __m256i *BMRESTRICT block, unsigned size)
Definition: bmavx2.h:1870
unsigned avx2_and_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
AND array elements against another array (unaligned) dst &= *src.
Definition: bmavx2.h:729
void avx2_invert_block(__m256i *BMRESTRICT dst)
Invert bit-block dst = ~*dst or dst ^= *dst.
Definition: bmavx2.h:1464
void avx2_xor_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
XOR array elements to specified mask dst = *src ^ mask.
Definition: bmavx2.h:447
unsigned avx2_xor_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
3 operand XOR dst = *src1 ^ src2
Definition: bmavx2.h:1106
bool avx2_sub_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
Definition: bmavx2.h:1232
unsigned avx2_and_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND array elements against another array dst &= *src.
Definition: bmavx2.h:496
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
Definition: bmavx2.h:1629
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in sorted, growing array.
Definition: bmavx2.h:2693
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.
Definition: bmavx2.h:2661
bm::id_t avx2_bit_count(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end)
AVX2 Harley-Seal popcount The algorithm is based on the paper "Faster Population Counts using AVX2 In...
Definition: bmavx2.h:156
bool avx2_or_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR 2 arrays and copy to the destination dst = *src1 | src2.
Definition: bmavx2.h:893
Definition: bm.h:78
const unsigned set_block_digest_wave_size
Definition: bmconst.h:67
void avx2_set_block_bits3(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition: bmavx2.h:2534
unsigned int word_t
Definition: bmconst.h:39
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
Definition: bmavx2.h:2278
__m256i avx2_setbit_to256(unsigned i)
Experimental.
Definition: bmavx2.h:2627
const unsigned set_block_mask
Definition: bmconst.h:57
bm::id_t avx2_bit_count_or(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
Definition: bmavx2.h:337
void avx2_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Definition: bmavx2.h:2319
const unsigned set_word_shift
Definition: bmconst.h:72
const unsigned set_block_size
Definition: bmconst.h:55
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
Definition: bmavx2.h:2239
unsigned long long int id64_t
Definition: bmconst.h:35
const unsigned block_waves
Definition: bmconst.h:66
unsigned int id_t
Definition: bmconst.h:38
BMFORCEINLINE __m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX target, by indexes (int4) from the source.
Definition: bmavx2.h:2448
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
Definition: bmutil.h:335
void avx2_set_block_bits2(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition: bmavx2.h:2495
unsigned short gap_word_t
Definition: bmconst.h:78
const unsigned gap_max_bits
Definition: bmconst.h:81
const unsigned set_block_shift
Definition: bmconst.h:56
void avx2_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
Definition: bmavx2.h:2909
const unsigned set_word_mask
Definition: bmconst.h:73
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
Definition: bmutil.h:345