#ifndef BMAVX2__H__INCLUDED__
#define BMAVX2__H__INCLUDED__

#include <iostream>    // used by the debug print helpers below
#include <immintrin.h> // AVX2 intrinsics
inline
void avx2_print256_u32(const char* prefix, const __m256i& value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned);
    unsigned buffer[n];
    _mm256_storeu_si256((__m256i*)buffer, value);
    std::cout << prefix << " [ ";
    for (int i = int(n) - 1; i >= 0; --i)
        std::cout << std::hex << buffer[i] << " ";
    std::cout << "]" << std::endl;
}
inline
void avx2_print256_u16(const char* prefix, const __m256i& value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned short);
    unsigned short buffer[n];
    _mm256_storeu_si256((__m256i*)buffer, value);
    std::cout << prefix << " [ ";
    for (int i = int(n) - 1; i >= 0; --i)
        std::cout << buffer[i] << " ";
    std::cout << "]" << std::endl;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
// carry-save adder: (h, l) = bitwise full-add of (a, b, c) over 256-bit lanes
#define BM_CSA256(h, l, a, b, c) \
{ \
    __m256i u = _mm256_xor_si256(a, b); \
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
    l = _mm256_xor_si256(u, c); \
}
// per-64-bit-lane byte popcount: nibble LUT (PSHUFB) + SAD reduction
#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
    __m256i lo = _mm256_and_si256(v, low_mask); \
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
    __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
    __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
    ret = _mm256_sad_epu8(cnt1, cnt2); \
}
#define BM_AVX2_DECL_LOOKUP1 \
    __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
                                       4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
#define BM_AVX2_DECL_LOOKUP2 \
    __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
                                       4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
#define BM_AVX2_POPCNT_PROLOG \
    BM_AVX2_DECL_LOOKUP1 \
    BM_AVX2_DECL_LOOKUP2 \
    __m256i low_mask = _mm256_set1_epi8(0x0f); \
    __m256i bc;
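
/*
    Minimal sketch of how the macros above combine (the function name is
    illustrative, not part of the library): popcount of a single __m256i.
    cnt1 bytes hold 4 + popcount(low nibble), cnt2 bytes hold
    4 - popcount(high nibble), so _mm256_sad_epu8 sums |cnt1 - cnt2| =
    nibble popcounts into the four 64-bit lanes.
*/
inline unsigned avx2_popcnt256_sketch(__m256i v)
{
    BM_AVX2_POPCNT_PROLOG
    BM_AVX2_BIT_COUNT(bc, v)
    unsigned long long cnt64[4];
    _mm256_storeu_si256((__m256i*)cnt64, bc);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}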
inline
bm::id_t avx2_bit_count(const __m256i* BMRESTRICT block,
                        const __m256i* BMRESTRICT block_end)
{
    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
    __m256i b, c;
    BM_AVX2_POPCNT_PROLOG
    do
    {
        b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursA, twos, twos, twosA, twosB);
        b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursB, twos, twos, twosA, twosB);
        BM_CSA256(eightsA, fours, fours, foursA, foursB);
        b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursA, twos, twos, twosA, twosB);
        b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
        BM_CSA256(twosA, ones, ones, b, c);
        b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
        BM_CSA256(twosB, ones, ones, b, c);
        BM_CSA256(foursB, twos, twos, twosA, twosB);
        BM_CSA256(eightsB, fours, fours, foursA, foursB);
        BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
        BM_AVX2_BIT_COUNT(bc, sixteens)
        cnt = _mm256_add_epi64(cnt, bc);
        block += 16;
    } while (block < block_end);

    // weighted reduction: total = 16*cnt + 8*eights + 4*fours + 2*twos + ones
    cnt = _mm256_slli_epi64(cnt, 4);
    BM_AVX2_BIT_COUNT(bc, eights)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
    BM_AVX2_BIT_COUNT(bc, fours)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
    BM_AVX2_BIT_COUNT(bc, twos)
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
    BM_AVX2_BIT_COUNT(bc, ones)
    cnt = _mm256_add_epi64(cnt, bc);

    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
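
/*
    Usage sketch (buffer name and alignment attributes are illustrative):
    Harley-Seal popcount of one aligned bit-block; pointers must be 32-byte
    aligned for _mm256_load_si256.

    // BM_ALIGN32 bm::word_t blk[bm::set_block_size] BM_ALIGN32ATTR;
    // unsigned cnt = avx2_bit_count((__m256i*)blk,
    //                     (__m256i*)(blk + bm::set_block_size));
*/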
inline
bm::id_t avx2_bit_block_count(const bm::word_t* const block, bm::id64_t digest)
{
    bm::id_t count = 0;
    BM_AVX2_POPCNT_PROLOG
    __m256i cnt = _mm256_setzero_si256();
    bm::id64_t d = digest;
    while (d)
    {
        bm::id64_t t = d & -d; // isolate the lowest set digest bit
        unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
        unsigned off = wave * bm::set_block_digest_wave_size;
        const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];
        __m256i m1A, m1B, m1C, m1D;
        m1A = _mm256_load_si256(wave_src);
        m1B = _mm256_load_si256(wave_src+1);
        if (!_mm256_testz_si256(m1A, m1A))
        { BM_AVX2_BIT_COUNT(bc, m1A) cnt = _mm256_add_epi64(cnt, bc); }
        if (!_mm256_testz_si256(m1B, m1B))
        { BM_AVX2_BIT_COUNT(bc, m1B) cnt = _mm256_add_epi64(cnt, bc); }
        m1C = _mm256_load_si256(wave_src+2);
        m1D = _mm256_load_si256(wave_src+3);
        if (!_mm256_testz_si256(m1C, m1C))
        { BM_AVX2_BIT_COUNT(bc, m1C) cnt = _mm256_add_epi64(cnt, bc); }
        if (!_mm256_testz_si256(m1D, m1D))
        { BM_AVX2_BIT_COUNT(bc, m1D) cnt = _mm256_add_epi64(cnt, bc); }
        d &= d - 1; // clear the processed digest bit
    } // while
    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
    return count;
}
inline
bm::id_t avx2_bit_count_and(const __m256i* BMRESTRICT block,
                            const __m256i* BMRESTRICT block_end,
                            const __m256i* BMRESTRICT mask_block)
{
    BM_AVX2_POPCNT_PROLOG
    __m256i cnt = _mm256_setzero_si256();
    __m256i ymm0, ymm1;
    do
    {
        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);

        ymm0 = _mm256_load_si256(block);
        ymm1 = _mm256_load_si256(mask_block);
        ymm0 = _mm256_and_si256(ymm0, ymm1);
        ++block; ++mask_block;
        BM_AVX2_BIT_COUNT(bc, ymm0)
        cnt = _mm256_add_epi64(cnt, bc);
    } while (block < block_end);

    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
inline
bm::id_t avx2_bit_count_or(const __m256i* BMRESTRICT block,
                           const __m256i* BMRESTRICT block_end,
                           const __m256i* BMRESTRICT mask_block)
{
    BM_AVX2_POPCNT_PROLOG
    __m256i cnt = _mm256_setzero_si256();
    do
    {
        __m256i tmp0 = _mm256_load_si256(block);
        __m256i tmp1 = _mm256_load_si256(mask_block);
        tmp0 = _mm256_or_si256(tmp0, tmp1);
        BM_AVX2_BIT_COUNT(bc, tmp0)
        cnt = _mm256_add_epi64(cnt, bc);
        ++block; ++mask_block;
    } while (block < block_end);

    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
inline
bm::id_t avx2_bit_count_xor(const __m256i* BMRESTRICT block,
                            const __m256i* BMRESTRICT block_end,
                            const __m256i* BMRESTRICT mask_block)
{
    BM_AVX2_POPCNT_PROLOG
    __m256i cnt = _mm256_setzero_si256();
    __m256i mA, mB, mC, mD;
    do
    {
        mA = _mm256_xor_si256(_mm256_load_si256(block+0),
                              _mm256_load_si256(mask_block+0));
        BM_AVX2_BIT_COUNT(bc, mA)
        cnt = _mm256_add_epi64(cnt, bc);

        mB = _mm256_xor_si256(_mm256_load_si256(block+1),
                              _mm256_load_si256(mask_block+1));
        BM_AVX2_BIT_COUNT(bc, mB)
        cnt = _mm256_add_epi64(cnt, bc);

        mC = _mm256_xor_si256(_mm256_load_si256(block+2),
                              _mm256_load_si256(mask_block+2));
        BM_AVX2_BIT_COUNT(bc, mC)
        cnt = _mm256_add_epi64(cnt, bc);

        mD = _mm256_xor_si256(_mm256_load_si256(block+3),
                              _mm256_load_si256(mask_block+3));
        BM_AVX2_BIT_COUNT(bc, mD)
        cnt = _mm256_add_epi64(cnt, bc);

        block += 4; mask_block += 4;
    } while (block < block_end);

    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
inline
bm::id_t avx2_bit_count_sub(const __m256i* BMRESTRICT block,
                            const __m256i* BMRESTRICT block_end,
                            const __m256i* BMRESTRICT mask_block)
{
    BM_AVX2_POPCNT_PROLOG
    __m256i cnt = _mm256_setzero_si256();
    do
    {
        __m256i tmp0 = _mm256_load_si256(block);
        __m256i tmp1 = _mm256_load_si256(mask_block);
        tmp0 = _mm256_andnot_si256(tmp1, tmp0); // block AND NOT mask
        BM_AVX2_BIT_COUNT(bc, tmp0)
        cnt = _mm256_add_epi64(cnt, bc);
        ++block; ++mask_block;
    } while (block < block_end);

    bm::id64_t* cnt64 = (bm::id64_t*)&cnt;
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
inline
void avx2_xor_arr_2_mask(__m256i* BMRESTRICT dst,
                         const __m256i* BMRESTRICT src,
                         const __m256i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m256i yM = _mm256_set1_epi32(int(mask));
    do
    {
        _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM));
        _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
        _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
        _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
        dst += 4; src += 4;
    } while (src < src_end);
}
inline
void avx2_andnot_arr_2_mask(__m256i* BMRESTRICT dst,
                            const __m256i* BMRESTRICT src,
                            const __m256i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m256i yM = _mm256_set1_epi32(int(mask));
    do
    {
        _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM));
        _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
        _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
        _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
        dst += 4; src += 4;
    } while (src < src_end);
}
inline
unsigned avx2_and_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm256_setzero_si256();
    do
    {
        m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
        m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
        m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
        m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);
        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);

        accA = _mm256_or_si256(accA, m1A);
        accB = _mm256_or_si256(accB, m1B);
        accC = _mm256_or_si256(accC, m1C);
        accD = _mm256_or_si256(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA); // any bits left?
}
inline
bool avx2_and_digest(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);

    return _mm256_testz_si256(m1A, m1A); // true if the wave is now all-zero
}
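
/*
    Usage sketch (variable names are illustrative): the digest functions
    return true when the 4x256-bit wave became all-zero, which lets the
    caller keep a 64-bit digest bitmap of non-empty waves up to date.

    // bool zero = avx2_and_digest(dst_wave, src_wave);
    // if (zero)
    //     digest &= ~(1ull << wave_idx);
*/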
inline
bool avx2_and_digest_2way(__m256i* BMRESTRICT dst,
                          const __m256i* BMRESTRICT src1,
                          const __m256i* BMRESTRICT src2)
{
    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);

    return _mm256_testz_si256(m1A, m1A);
}
inline
bool avx2_and_or_digest_2way(__m256i* BMRESTRICT dst,
                             const __m256i* BMRESTRICT src1,
                             const __m256i* BMRESTRICT src2)
{
    const __m256i maskF = _mm256_set1_epi32(~0u);
    __m256i m1A, m1B, m1C, m1D;
    __m256i mACC1;
    __m256i mSA, mSB, mSC, mSD;

    mSA = _mm256_load_si256(dst+0);
    mSB = _mm256_load_si256(dst+1);
    mACC1 = _mm256_and_si256(mSA, mSB);

    mSC = _mm256_load_si256(dst+2);
    mSD = _mm256_load_si256(dst+3);
    mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));

    mACC1 = _mm256_xor_si256(mACC1, maskF); // ~(AND-reduce of dst)
    if (_mm256_testz_si256(mACC1, mACC1)) // dst is all-ones: OR cannot change it
        return false;

    m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    mACC1 =
        _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
    bool all_z = _mm256_testz_si256(mACC1, mACC1);
    if (all_z) // (src1 & src2) adds nothing
        return all_z;

    m1A = _mm256_or_si256(mSA, m1A);
    m1B = _mm256_or_si256(mSB, m1B);
    m1C = _mm256_or_si256(mSC, m1C);
    m1D = _mm256_or_si256(mSD, m1D);

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    return all_z;
}
inline
bool avx2_and_digest_5way(__m256i* BMRESTRICT dst,
                          const __m256i* BMRESTRICT src1,
                          const __m256i* BMRESTRICT src2,
                          const __m256i* BMRESTRICT src3,
                          const __m256i* BMRESTRICT src4)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i m1E, m1F, m1G, m1H;

    { // (src1 & src2)
        __m256i s1_0, s2_0, s1_1, s2_1;
        s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
        s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
        m1A = _mm256_and_si256(s1_0, s2_0);
        m1B = _mm256_and_si256(s1_1, s2_1);
        s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
        s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
        m1C = _mm256_and_si256(s1_0, s2_0);
        m1D = _mm256_and_si256(s1_1, s2_1);
    }
    { // (src3 & src4)
        __m256i s3_0, s4_0, s3_1, s4_1;
        s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
        s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
        m1E = _mm256_and_si256(s3_0, s4_0);
        m1F = _mm256_and_si256(s3_1, s4_1);

        m1A = _mm256_and_si256(m1A, m1E);
        m1B = _mm256_and_si256(m1B, m1F);

        s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
        s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
        m1G = _mm256_and_si256(s3_0, s4_0);
        m1H = _mm256_and_si256(s3_1, s4_1);
    }
    { // AND with the destination
        __m256i dst0, dst1;
        dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);

        m1C = _mm256_and_si256(m1C, m1G);
        m1D = _mm256_and_si256(m1D, m1H);
        m1A = _mm256_and_si256(m1A, dst0);
        m1B = _mm256_and_si256(m1B, dst1);

        dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);

        m1C = _mm256_and_si256(m1C, dst0);
        m1D = _mm256_and_si256(m1D, dst1);
    }
    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);

    return _mm256_testz_si256(m1A, m1A);
}
inline
unsigned avx2_and_arr_unal(__m256i* BMRESTRICT dst,
                           const __m256i* BMRESTRICT src,
                           const __m256i* BMRESTRICT src_end)
{
    __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m256i accA, accB, accC, accD;

    accA = _mm256_setzero_si256();
    accB = _mm256_setzero_si256();
    accC = _mm256_setzero_si256();
    accD = _mm256_setzero_si256();
    do
    {
        m1A = _mm256_loadu_si256(src+0); // unaligned source
        m2A = _mm256_load_si256(dst+0);
        m1A = _mm256_and_si256(m1A, m2A);
        _mm256_store_si256(dst+0, m1A);
        accA = _mm256_or_si256(accA, m1A);

        m1B = _mm256_loadu_si256(src+1);
        m2B = _mm256_load_si256(dst+1);
        m1B = _mm256_and_si256(m1B, m2B);
        _mm256_store_si256(dst+1, m1B);
        accB = _mm256_or_si256(accB, m1B);

        m1C = _mm256_loadu_si256(src+2);
        m2C = _mm256_load_si256(dst+2);
        m1C = _mm256_and_si256(m1C, m2C);
        _mm256_store_si256(dst+2, m1C);
        accC = _mm256_or_si256(accC, m1C);

        m1D = _mm256_loadu_si256(src+3);
        m2D = _mm256_load_si256(dst+3);
        m1D = _mm256_and_si256(m1D, m2D);
        _mm256_store_si256(dst+3, m1D);
        accD = _mm256_or_si256(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
}
inline
bool avx2_or_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u); // all-ones accumulators
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    __m256i* BMRESTRICT dst2 =
        (__m256i*)((bm::word_t*)(dst) + bm::set_block_size/2);
    const __m256i* BMRESTRICT src2 =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size/2);
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
    do
    {
        m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
        m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
        mAccF0 = _mm256_and_si256(mAccF0, m1A);
        mAccF0 = _mm256_and_si256(mAccF0, m1B);

        _mm256_stream_si256(dst, m1A);
        _mm256_stream_si256(dst+1, m1B);

        src += 2; dst += 2;

        m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
        m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
        mAccF1 = _mm256_and_si256(mAccF1, m1C);
        mAccF1 = _mm256_and_si256(mAccF1, m1D);

        _mm256_stream_si256(dst2, m1C);
        _mm256_stream_si256(dst2+1, m1D);

        src2 += 2; dst2 += 2;
    } while (src2 < src_end);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u); // block is now all-ones
}
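
/*
    Usage sketch (names illustrative): the all-ones return value lets the
    caller replace a saturated block with the library's FULL-block
    representation instead of keeping a fully-set bit-block in memory.

    // bool all_one = avx2_or_block(dst_block_ptr, src_block_ptr);
    // if (all_one)
    //     ... mark the block as FULL, release the bit-block memory
*/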
inline
bool avx2_or_arr_unal(__m256i* BMRESTRICT dst,
                      const __m256i* BMRESTRICT src,
                      const __m256i* BMRESTRICT src_end)
{
    __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);
    do
    {
        m1A = _mm256_loadu_si256(src+0);
        m2A = _mm256_load_si256(dst+0);
        m1A = _mm256_or_si256(m1A, m2A);
        _mm256_store_si256(dst+0, m1A);

        m1B = _mm256_loadu_si256(src+1);
        m2B = _mm256_load_si256(dst+1);
        m1B = _mm256_or_si256(m1B, m2B);
        _mm256_store_si256(dst+1, m1B);

        m1C = _mm256_loadu_si256(src+2);
        m2C = _mm256_load_si256(dst+2);
        m1C = _mm256_or_si256(m1C, m2C);
        _mm256_store_si256(dst+2, m1C);

        m1D = _mm256_loadu_si256(src+3);
        m2D = _mm256_load_si256(dst+3);
        m1D = _mm256_or_si256(m1D, m2D);
        _mm256_store_si256(dst+3, m1D);

        mAccF1 = _mm256_and_si256(mAccF1, m1C);
        mAccF1 = _mm256_and_si256(mAccF1, m1D);
        mAccF0 = _mm256_and_si256(mAccF0, m1A);
        mAccF0 = _mm256_and_si256(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
}
inline
bool avx2_or_block_2way(__m256i* BMRESTRICT dst,
                        const __m256i* BMRESTRICT src1,
                        const __m256i* BMRESTRICT src2)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);
    const __m256i* BMRESTRICT src_end1 =
        (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
    do
    {
        m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
        m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
        m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
        m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);
        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);

        mAccF1 = _mm256_and_si256(mAccF1, m1C);
        mAccF1 = _mm256_and_si256(mAccF1, m1D);
        mAccF0 = _mm256_and_si256(mAccF0, m1A);
        mAccF0 = _mm256_and_si256(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
}
inline
bool avx2_or_block_3way(__m256i* BMRESTRICT dst,
                        const __m256i* BMRESTRICT src1,
                        const __m256i* BMRESTRICT src2)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);
    const __m256i* BMRESTRICT src_end1 =
        (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
    do
    {
        m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
        m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
        m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
        m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));

        m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
        m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
        m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
        m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));

        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);
        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);

        mAccF1 = _mm256_and_si256(mAccF1, m1C);
        mAccF1 = _mm256_and_si256(mAccF1, m1D);
        mAccF0 = _mm256_and_si256(mAccF0, m1A);
        mAccF0 = _mm256_and_si256(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
}
inline
bool avx2_or_block_5way(__m256i* BMRESTRICT dst,
                        const __m256i* BMRESTRICT src1,
                        const __m256i* BMRESTRICT src2,
                        const __m256i* BMRESTRICT src3,
                        const __m256i* BMRESTRICT src4)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);
    const __m256i* BMRESTRICT src_end1 =
        (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
    do
    {
        m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
        m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
        m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
        m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));

        m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
        m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
        m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
        m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));

        m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
        m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
        m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
        m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));

        m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
        m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
        m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
        m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));

        _mm256_stream_si256(dst+0, m1A);
        _mm256_stream_si256(dst+1, m1B);
        _mm256_stream_si256(dst+2, m1C);
        _mm256_stream_si256(dst+3, m1D);

        mAccF1 = _mm256_and_si256(mAccF1, m1C);
        mAccF1 = _mm256_and_si256(mAccF1, m1D);
        mAccF0 = _mm256_and_si256(mAccF0, m1A);
        mAccF0 = _mm256_and_si256(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;
        _mm_prefetch((const char*)src3, _MM_HINT_T0);
        _mm_prefetch((const char*)src4, _MM_HINT_T0);

        dst += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
}
inline
unsigned avx2_xor_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm256_setzero_si256();
    do
    {
        m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
        m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
        m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
        m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);
        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);

        accA = _mm256_or_si256(accA, m1A);
        accB = _mm256_or_si256(accB, m1B);
        accC = _mm256_or_si256(accC, m1C);
        accD = _mm256_or_si256(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
}
inline
unsigned avx2_xor_block_2way(__m256i* BMRESTRICT dst,
                             const __m256i* BMRESTRICT src1,
                             const __m256i* BMRESTRICT src2)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;
    const __m256i* BMRESTRICT src1_end =
        (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);

    accA = accB = accC = accD = _mm256_setzero_si256();
    do
    {
        m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
        m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
        m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
        m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));

        _mm256_store_si256(dst + 0, m1A);
        _mm256_store_si256(dst + 1, m1B);
        _mm256_store_si256(dst + 2, m1C);
        _mm256_store_si256(dst + 3, m1D);

        accA = _mm256_or_si256(accA, m1A);
        accB = _mm256_or_si256(accB, m1B);
        accC = _mm256_or_si256(accC, m1C);
        accD = _mm256_or_si256(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
}
inline
unsigned avx2_sub_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm256_setzero_si256();
    do
    {
        m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
        m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
        m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
        m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

        _mm256_store_si256(dst+2, m1C);
        _mm256_store_si256(dst+3, m1D);
        _mm256_store_si256(dst+0, m1A);
        _mm256_store_si256(dst+1, m1B);

        accA = _mm256_or_si256(accA, m1A);
        accB = _mm256_or_si256(accB, m1B);
        accC = _mm256_or_si256(accC, m1C);
        accD = _mm256_or_si256(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);
}
inline
bool avx2_sub_digest(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);

    return _mm256_testz_si256(m1A, m1A);
}
inline
bool avx2_sub_digest_2way(__m256i* BMRESTRICT dst,
                          const __m256i* BMRESTRICT src1,
                          const __m256i* BMRESTRICT src2)
{
    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
    m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
    m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
    m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);

    return _mm256_testz_si256(m1A, m1A);
}
inline
void avx2_set_block(__m256i* BMRESTRICT dst, bm::word_t value)
{
    __m256i* BMRESTRICT dst_end =
        (__m256i*)((bm::word_t*)(dst) + bm::set_block_size);
    __m256i ymm0 = _mm256_set1_epi32(int(value));
    do
    {
        _mm256_store_si256(dst,   ymm0);
        _mm256_store_si256(dst+1, ymm0);
        _mm256_store_si256(dst+2, ymm0);
        _mm256_store_si256(dst+3, ymm0);
        dst += 4;
    } while (dst < dst_end);
}
inline
void avx2_copy_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i ymm0, ymm1, ymm2, ymm3;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
    do
    {
        ymm0 = _mm256_load_si256(src+0);
        ymm1 = _mm256_load_si256(src+1);
        ymm2 = _mm256_load_si256(src+2);
        ymm3 = _mm256_load_si256(src+3);

        _mm256_store_si256(dst+0, ymm0);
        _mm256_store_si256(dst+1, ymm1);
        _mm256_store_si256(dst+2, ymm2);
        _mm256_store_si256(dst+3, ymm3);

        ymm0 = _mm256_load_si256(src+4);
        ymm1 = _mm256_load_si256(src+5);
        ymm2 = _mm256_load_si256(src+6);
        ymm3 = _mm256_load_si256(src+7);

        _mm256_store_si256(dst+4, ymm0);
        _mm256_store_si256(dst+5, ymm1);
        _mm256_store_si256(dst+6, ymm2);
        _mm256_store_si256(dst+7, ymm3);

        src += 8; dst += 8;
    } while (src < src_end);
}
inline
void avx2_copy_block_unalign(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i ymm0, ymm1, ymm2, ymm3;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
    do
    {
        ymm0 = _mm256_loadu_si256(src+0); // unaligned source
        ymm1 = _mm256_loadu_si256(src+1);
        ymm2 = _mm256_loadu_si256(src+2);
        ymm3 = _mm256_loadu_si256(src+3);

        _mm256_store_si256(dst+0, ymm0);
        _mm256_store_si256(dst+1, ymm1);
        _mm256_store_si256(dst+2, ymm2);
        _mm256_store_si256(dst+3, ymm3);

        ymm0 = _mm256_loadu_si256(src+4);
        ymm1 = _mm256_loadu_si256(src+5);
        ymm2 = _mm256_loadu_si256(src+6);
        ymm3 = _mm256_loadu_si256(src+7);

        _mm256_store_si256(dst+4, ymm0);
        _mm256_store_si256(dst+5, ymm1);
        _mm256_store_si256(dst+6, ymm2);
        _mm256_store_si256(dst+7, ymm3);

        src += 8; dst += 8;
    } while (src < src_end);
}
inline
void avx2_stream_block(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i ymm0, ymm1, ymm2, ymm3;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
    do
    {
        ymm0 = _mm256_load_si256(src+0);
        ymm1 = _mm256_load_si256(src+1);
        ymm2 = _mm256_load_si256(src+2);
        ymm3 = _mm256_load_si256(src+3);

        _mm256_stream_si256(dst+0, ymm0); // non-temporal stores
        _mm256_stream_si256(dst+1, ymm1);
        _mm256_stream_si256(dst+2, ymm2);
        _mm256_stream_si256(dst+3, ymm3);

        ymm0 = _mm256_load_si256(src+4);
        ymm1 = _mm256_load_si256(src+5);
        ymm2 = _mm256_load_si256(src+6);
        ymm3 = _mm256_load_si256(src+7);

        _mm256_stream_si256(dst+4, ymm0);
        _mm256_stream_si256(dst+5, ymm1);
        _mm256_stream_si256(dst+6, ymm2);
        _mm256_stream_si256(dst+7, ymm3);

        src += 8; dst += 8;
    } while (src < src_end);
}
inline
void avx2_stream_block_unalign(__m256i* BMRESTRICT dst, const __m256i* BMRESTRICT src)
{
    __m256i ymm0, ymm1, ymm2, ymm3;
    const __m256i* BMRESTRICT src_end =
        (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
    do
    {
        ymm0 = _mm256_loadu_si256(src+0); // unaligned source
        ymm1 = _mm256_loadu_si256(src+1);
        ymm2 = _mm256_loadu_si256(src+2);
        ymm3 = _mm256_loadu_si256(src+3);

        _mm256_stream_si256(dst+0, ymm0); // non-temporal stores
        _mm256_stream_si256(dst+1, ymm1);
        _mm256_stream_si256(dst+2, ymm2);
        _mm256_stream_si256(dst+3, ymm3);

        ymm0 = _mm256_loadu_si256(src+4);
        ymm1 = _mm256_loadu_si256(src+5);
        ymm2 = _mm256_loadu_si256(src+6);
        ymm3 = _mm256_loadu_si256(src+7);

        _mm256_stream_si256(dst+4, ymm0);
        _mm256_stream_si256(dst+5, ymm1);
        _mm256_stream_si256(dst+6, ymm2);
        _mm256_stream_si256(dst+7, ymm3);

        src += 8; dst += 8;
    } while (src < src_end);
}
inline
void avx2_invert_block(__m256i* BMRESTRICT dst)
{
    __m256i maskFF = _mm256_set1_epi32(-1);
    const __m256i* BMRESTRICT dst_end =
        (const __m256i*)((bm::word_t*)(dst) + bm::set_block_size);
    __m256i ymm0, ymm1;
    do
    {
        ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
        ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);

        _mm256_store_si256(dst+0, ymm0);
        _mm256_store_si256(dst+1, ymm1);

        ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
        ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);

        _mm256_store_si256(dst+2, ymm0);
        _mm256_store_si256(dst+3, ymm1);

        dst += 4;
    } while (dst < dst_end);
}
inline
bool avx2_is_all_zero(const __m256i* BMRESTRICT block)
{
    const __m256i* BMRESTRICT block_end =
        (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
    do
    {
        __m256i w0 = _mm256_load_si256(block+0);
        __m256i w1 = _mm256_load_si256(block+1);
        __m256i wA = _mm256_or_si256(w0, w1);
        __m256i w2 = _mm256_load_si256(block+2);
        __m256i w3 = _mm256_load_si256(block+3);
        __m256i wB = _mm256_or_si256(w2, w3);
        wA = _mm256_or_si256(wA, wB);
        if (!_mm256_testz_si256(wA, wA))
            return false;
        block += 4;
    } while (block < block_end);
    return true;
}
inline
bool avx2_is_digest_zero(const __m256i* BMRESTRICT block)
{
    __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
    __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
    wA = _mm256_or_si256(wA, wB);
    return _mm256_testz_si256(wA, wA);
}
inline
void avx2_block_set_digest(__m256i* dst, unsigned value)
{
    __m256i mV = _mm256_set1_epi32(int(value));
    _mm256_store_si256(dst,     mV);
    _mm256_store_si256(dst + 1, mV);
    _mm256_store_si256(dst + 2, mV);
    _mm256_store_si256(dst + 3, mV);
}
inline
bool avx2_is_all_one(const __m256i* BMRESTRICT block)
{
    const __m256i maskF = _mm256_set1_epi32(~0u);
    const __m256i* BMRESTRICT block_end =
        (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
    do
    {
        __m256i m1A = _mm256_load_si256(block+0);
        __m256i m1B = _mm256_load_si256(block+1);
        m1A = _mm256_xor_si256(m1A, maskF); // zero only if all-ones
        m1B = _mm256_xor_si256(m1B, maskF);
        m1A = _mm256_or_si256(m1A, m1B);
        if (!_mm256_testz_si256(m1A, m1A))
            return false;
        block += 2;
    } while (block < block_end);
    return true;
}
inline
bool avx2_test_all_one_wave(const void* ptr)
{
    __m256i maskF = _mm256_set1_epi32(~0u);
    __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);
}
inline
bool avx2_test_all_zero_wave(const void* ptr)
{
    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
    return _mm256_testz_si256(w0, w0);
}
inline
bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
{
    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
    __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
    w0 = _mm256_or_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);
}
inline
bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
{
    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
    __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
    w0 = _mm256_xor_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);
}
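
/*
    Sketch (names illustrative): XOR + VPTEST gives a branch-friendly
    256-bit equality test, one wave (8 x 32-bit words) at a time.

    // const bm::word_t* a = ...; const bm::word_t* b = ...;
    // bool equal = avx2_test_all_eq_wave2(a, b);
*/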
inline
bool avx2_shift_l1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
{
    __m256i* block_end =
        (__m256i*)((bm::word_t*)(block) + bm::set_block_size);
    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mMask1 = _mm256_set1_epi32(1);
    __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); // lane rotation
    unsigned co2;

    for (--block_end; block_end >= block; block_end -= 2)
    {
        __m256i m1A = _mm256_load_si256(block_end);
        __m256i m2A = _mm256_load_si256(block_end-1);

        __m256i m1CO = _mm256_and_si256(m1A, mMask1); // carry-out = bit 0
        __m256i m2CO = _mm256_and_si256(m2A, mMask1);

        co2 = _mm256_extract_epi32(m1CO, 0);

        m1A = _mm256_srli_epi32(m1A, 1);
        m2A = _mm256_srli_epi32(m2A, 1);

        // rotate the carry-out lanes and insert the incoming carry
        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 7);
        co1 = co2;

        co2 = _mm256_extract_epi32(m2CO, 0);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);

        m1COshft = _mm256_slli_epi32(m1COshft, 31); // carry enters at bit 31
        m2COshft = _mm256_slli_epi32(m2COshft, 31);

        m1A = _mm256_or_si256(m1A, m1COshft);
        m2A = _mm256_or_si256(m2A, m2COshft);

        _mm256_store_si256(block_end,   m1A);
        _mm256_store_si256(block_end-1, m2A);

        mAcc = _mm256_or_si256(mAcc, m1A);
        mAcc = _mm256_or_si256(mAcc, m2A);

        co1 = co2;
    } // for
    *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
    return co1;
}
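
/*
    How the lane-carry rotation works (explanatory sketch, not part of the
    original code): each 32-bit lane produces a one-bit carry-out, and
    _mm256_permutevar8x32_epi32 rotates those carries so every lane receives
    its neighbor's carry-in; the block-level carry enters via insert_epi32.
    Scalar model for one word w with incoming carry co:

        unsigned co_out = w & 1u;      // the bit that falls off
        w = (w >> 1) | (co << 31);     // shift, splice carry into the MSB
        co = co_out;                   // propagate to the next word
*/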
inline
bool avx2_shift_r1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
{
    const __m256i* block_end =
        (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0); // shift lanes up
    unsigned co2;

    for (; block < block_end; block += 2)
    {
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31); // carry-out = bit 31
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        m1A = _mm256_slli_epi32(m1A, 1);
        m2A = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
        co1 = co2;

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);

        m1A = _mm256_or_si256(m1A, m1COshft);
        m2A = _mm256_or_si256(m2A, m2COshft);

        _mm256_store_si256(block,   m1A);
        _mm256_store_si256(block+1, m2A);

        mAcc = _mm256_or_si256(mAcc, m1A);
        mAcc = _mm256_or_si256(mAcc, m2A);

        co1 = co2;
    } // for
    *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
    return co1;
}
inline
bool avx2_shift_r1_and(__m256i* BMRESTRICT block,
                       bm::word_t co1,
                       const __m256i* BMRESTRICT mask_block,
                       bm::id64_t* digest)
{
    bm::word_t* wblock = (bm::word_t*)block;
    const bm::word_t* mblock = (const bm::word_t*)mask_block;

    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    unsigned co2;

    bm::id64_t d = *digest;
    unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d)); // skip leading empty strides
    for (; di < 64; ++di)
    {
        const unsigned d_base = di * bm::set_block_digest_wave_size;
        const bm::id64_t dmask = (1ull << di);
        if (d & dmask) // digest stride is not empty
        {
            mAcc = _mm256_xor_si256(mAcc, mAcc); // zero the accumulator

            mask_block = (__m256i*)&mblock[d_base];
            _mm_prefetch((const char*)mask_block, _MM_HINT_NTA);
            block = (__m256i*)&wblock[d_base];

            for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
            {
                __m256i m1A = _mm256_load_si256(block);
                __m256i m2A = _mm256_load_si256(block+1);

                __m256i m1CO = _mm256_srli_epi32(m1A, 31);
                __m256i m2CO = _mm256_srli_epi32(m2A, 31);

                co2 = _mm256_extract_epi32(m1CO, 7);

                m1A = _mm256_slli_epi32(m1A, 1);
                m2A = _mm256_slli_epi32(m2A, 1);

                __m256i m1M = _mm256_load_si256(mask_block);
                __m256i m2M = _mm256_load_si256(mask_block+1);

                m1COshft = _mm256_insert_epi32(
                               _mm256_permutevar8x32_epi32(m1CO, mCOidx),
                               co1, 0);
                co1 = co2;

                co2 = _mm256_extract_epi32(m2CO, 7);
                m2COshft = _mm256_insert_epi32(
                               _mm256_permutevar8x32_epi32(m2CO, mCOidx),
                               co1, 0);
                co1 = co2;

                m1A = _mm256_or_si256(m1A, m1COshft);
                m2A = _mm256_or_si256(m2A, m2COshft);

                m1A = _mm256_and_si256(m1A, m1M); // apply the AND mask
                m2A = _mm256_and_si256(m2A, m2M);

                _mm256_store_si256(block,   m1A);
                _mm256_store_si256(block+1, m2A);

                mAcc = _mm256_or_si256(mAcc, m1A);
                mAcc = _mm256_or_si256(mAcc, m2A);
            } // for i

            if (_mm256_testz_si256(mAcc, mAcc)) // stride became all-zero
                d &= ~dmask;
        }
        else // stride was empty: only the incoming carry can set a bit
        {
            if (co1)
            {
                bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
                d |= (dmask & (w0 << di)); // update digest if the bit survived
                co1 = 0;
            }
        }
    } // for di
    *digest = d;
    return co1;
}
inline
unsigned avx2_bit_block_calc_change(const __m256i* BMRESTRICT block,
                                    unsigned size)
{
    const __m256i* block_end =
        (const __m256i*)((bm::word_t*)(block) + size);

    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    __m256i cntAcc = _mm256_setzero_si256();

    BM_AVX2_POPCNT_PROLOG

    bm::word_t w0 = *((const bm::word_t*)block);
    unsigned count = 1;

    BM_ALIGN32 bm::id64_t cnt_v[4] BM_ALIGN32ATTR;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
    {
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1); // shifted copy
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
        co1 = co2;

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
        co1 = co2;

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        // XOR with the 1-bit-shifted copy: set bits mark 0-1 transitions
        m1A = _mm256_xor_si256(m1A, m1As);
        m2A = _mm256_xor_si256(m2A, m2As);

        BM_AVX2_BIT_COUNT(bc, m1A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
        BM_AVX2_BIT_COUNT(bc, m2A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
    } // for

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
    count -= (w0 & 1u); // no previous bit to compare the first bit against
    return count;
}
inline
void avx2_bit_block_calc_xor_change(const __m256i* BMRESTRICT block,
                                    const __m256i* BMRESTRICT xor_block,
                                    unsigned size,
                                    unsigned* BMRESTRICT gcount,
                                    unsigned* BMRESTRICT bcount)
{
    const __m256i* block_end =
        (const __m256i*)((bm::word_t*)(block) + size);

    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);

    __m256i cntAcc = _mm256_setzero_si256();
    __m256i cntAcc2 = _mm256_setzero_si256();

    BM_AVX2_POPCNT_PROLOG

    bm::word_t w0 = *((const bm::word_t*)block) ^ *((const bm::word_t*)xor_block);

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    BM_ALIGN32 bm::id64_t cnt_v[4] BM_ALIGN32ATTR;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2, xor_block += 2)
    {
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);
        __m256i m1B = _mm256_load_si256(xor_block);
        __m256i m2B = _mm256_load_si256(xor_block+1);

        m1A = _mm256_xor_si256(m1A, m1B); // XOR-filtered block
        m2A = _mm256_xor_si256(m2A, m2B);

        BM_AVX2_BIT_COUNT(bc, m1A)
        cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
        BM_AVX2_BIT_COUNT(bc, m2A)
        cntAcc2 = _mm256_add_epi64(cntAcc2, bc);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1);
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
        co1 = co2;

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
        co1 = co2;

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        m1A = _mm256_xor_si256(m1A, m1As); // transition bits
        m2A = _mm256_xor_si256(m2A, m2As);

        BM_AVX2_BIT_COUNT(bc, m1A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
        BM_AVX2_BIT_COUNT(bc, m2A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
    } // for

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
    gap_count -= (w0 & 1u); // correction for the first bit

    _mm256_store_si256((__m256i*)cnt_v, cntAcc2);
    bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);

    *gcount = gap_count;
    *bcount = bit_count;
}
inline
void avx2_bit_block_calc_change_bc(const __m256i* BMRESTRICT block,
                                   unsigned* BMRESTRICT gcount,
                                   unsigned* BMRESTRICT bcount)
{
    const __m256i* block_end =
        (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);

    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    __m256i cntAcc = _mm256_setzero_si256();

    BM_AVX2_POPCNT_PROLOG

    bm::word_t w0 = *((const bm::word_t*)block);

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    BM_ALIGN32 bm::id64_t cnt_v[4] BM_ALIGN32ATTR;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
    {
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        { // popcount the raw words with scalar POPCNT
            const bm::id64_t* b64 = (const bm::id64_t*)block;
            bit_count += (unsigned)(_mm_popcnt_u64(b64[0]) + _mm_popcnt_u64(b64[1]));
            bit_count += (unsigned)(_mm_popcnt_u64(b64[2]) + _mm_popcnt_u64(b64[3]));
            bit_count += (unsigned)(_mm_popcnt_u64(b64[4]) + _mm_popcnt_u64(b64[5]));
            bit_count += (unsigned)(_mm_popcnt_u64(b64[6]) + _mm_popcnt_u64(b64[7]));
        }

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1);
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
        co1 = co2;

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
        co1 = co2;

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        m1A = _mm256_xor_si256(m1A, m1As);
        m2A = _mm256_xor_si256(m2A, m2As);

        BM_AVX2_BIT_COUNT(bc, m1A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
        BM_AVX2_BIT_COUNT(bc, m2A)
        cntAcc = _mm256_add_epi64(cntAcc, bc);
    } // for

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
    gap_count -= (w0 & 1u);

    *gcount = gap_count;
    *bcount = bit_count;
}
inline
bool avx2_bit_find_first_diff(const __m256i* BMRESTRICT block1,
                              const __m256i* BMRESTRICT block2,
                              unsigned* pos)
{
    const __m256i* block1_end =
        (const __m256i*)((bm::word_t*)(block1) + bm::set_block_size);
    __m256i maskZ = _mm256_setzero_si256();
    __m256i mA, mB;
    unsigned simd_lane = 0;
    BM_ALIGN32 unsigned simd_buf[8] BM_ALIGN32ATTR;
    do
    {
        mA = _mm256_xor_si256(_mm256_load_si256(block1),
                              _mm256_load_si256(block2));
        mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
                              _mm256_load_si256(block2+1));
        __m256i mOR = _mm256_or_si256(mA, mB);
        if (!_mm256_testz_si256(mOR, mOR)) // the difference is in this pair
        {
            if (!_mm256_testz_si256(mA, mA))
            {
                // invert the movemask to find the first NON-zero dword
                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask); // find the first !=0
                _mm256_store_si256((__m256i*)simd_buf, mA);
                unsigned widx = bsf >> 2; // (bsf / 4)
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w); // find the first set bit
                *pos = (simd_lane * 256) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm256_store_si256((__m256i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block1 += 2; block2 += 2;
    } while (block1 < block1_end);
    return false;
}
inline
bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned* pos)
{
    const __m256i* block_end =
        (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
    __m256i maskZ = _mm256_setzero_si256();
    __m256i mA, mB;
    unsigned simd_lane = 0;
    BM_ALIGN32 unsigned simd_buf[8] BM_ALIGN32ATTR;
    do
    {
        mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
        __m256i mOR = _mm256_or_si256(mA, mB);
        if (!_mm256_testz_si256(mOR, mOR)) // a set bit is in this pair
        {
            if (!_mm256_testz_si256(mA, mA))
            {
                unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
                int bsf = bm::bsf_asm32(mask); // find the first !=0
                _mm256_store_si256((__m256i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = bm::bsf_asm32(w);
                *pos = (simd_lane * 256) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm256_store_si256((__m256i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block += 2;
    } while (block < block_end);
    return false;
}
inline
const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* BMRESTRICT pbuf,
                                       unsigned avx_vect_waves,
                                       unsigned* sum)
{
    __m256i xcnt = _mm256_setzero_si256();

    for (unsigned i = 0; i < avx_vect_waves; ++i)
    {
        __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
        __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
        __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
        xcnt = _mm256_add_epi16(xcnt, ymm_s2);
        pbuf += 32;
    }
    // odd minus even 16-bit counters (GAP deltas)
    xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);

    // horizontal sum of the even-positioned counters
    xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
    xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
    __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1),
                                  _mm256_extracti128_si256(xcnt, 0));
    *sum += _mm_cvtsi128_si32(xcnt2) & 0xffff;
    return pbuf;
}
inline
unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size,
                                   unsigned nb, unsigned start)
{
    const unsigned unroll_factor = 16;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    idx += start;

    unsigned k = 0;
    __m256i nbM = _mm256_set1_epi32(int(nb));

    for (k = 0; k < len_unr; k += unroll_factor)
    {
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i nbA = _mm256_srli_epi32(idxA, bm::set_block_shift); // idx[] >> shift
        __m256i wcmpA = _mm256_cmpeq_epi8(nbM, nbA);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
            break;
        __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
        __m256i nbB = _mm256_srli_epi32(idxB, bm::set_block_shift);
        __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
            break;
    } // for k
    for (; k < len; ++k)
    {
        if (nb != unsigned(idx[k] >> bm::set_block_shift))
            break;
    }
    return start + k;
}
inline
void avx2_set_block_bits(bm::word_t* BMRESTRICT block,
                         const unsigned* BMRESTRICT idx,
                         unsigned start, unsigned stop)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    idx += start;

    __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
    __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
    __m256i mask1 = _mm256_set1_epi32(1);
    __m256i mask_tmp;

    BM_ALIGN32 unsigned mword_v[8] BM_ALIGN32ATTR;
    BM_ALIGN32 unsigned mask_v[8] BM_ALIGN32ATTR;

    unsigned k = 0, mask, w_idx;
    for (; k < len_unr; k += unroll_factor)
    {
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i nbitA = _mm256_and_si256(idxA, sb_mask);  // nbit = idx & set_block_mask
        __m256i nwordA = _mm256_srli_epi32(nbitA, bm::set_word_shift); // target words
        nbitA = _mm256_and_si256(nbitA, sw_mask);         // bit inside the word

        __m256i maskA = _mm256_sllv_epi32(mask1, nbitA);  // 1 << nbit

        _mm256_store_si256((__m256i*)mword_v, nwordA);

        // check if all 8 target words are the same
        mask_tmp = _mm256_shuffle_epi32(nwordA, _MM_SHUFFLE(1,1,1,1));
        mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
        if (mask == ~0u) // same word: OR all bit masks horizontally
        {
            w_idx = mword_v[0];
            mask_tmp = _mm256_xor_si256(mask_tmp, mask_tmp); // zero
            mask_tmp = _mm256_or_si256(mask_tmp, maskA);

            __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
            __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
            mask_tmp = _mm256_or_si256(mtmp0, mtmp1);
            mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4);
            mask_tmp = _mm256_or_si256(mtmp0, mask_tmp);
            mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8);
            mask_tmp = _mm256_or_si256(mtmp0, mask_tmp);

            int u0 = _mm256_extract_epi32(mask_tmp, 0);
            block[w_idx] |= u0;
        }
        else // bits land in different words
        {
            _mm256_store_si256((__m256i*)mask_v, maskA);

            // horizontal OR of the bit masks within each 128-bit half
            mask_tmp = _mm256_bsrli_epi128(maskA, 4);
            mask_tmp = _mm256_or_si256(mask_tmp, maskA);
            __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8);
            mask_tmp = _mm256_or_si256(m0, mask_tmp);

            int u0 = _mm256_extract_epi32(mask_tmp, 0); // OR of the low half
            int u4 = _mm256_extract_epi32(mask_tmp, 4); // OR of the high half

            { // low 128: same target word?
                mask_tmp = _mm256_permute2x128_si256(nwordA, nwordA, 0);
                __m256i m0l = _mm256_shuffle_epi32(mask_tmp, 0x0);
                mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0l));
                if (mask == ~0u)
                    block[mword_v[0]] |= u0;
                else
                {
                    block[mword_v[0]] |= mask_v[0];
                    block[mword_v[1]] |= mask_v[1];
                    block[mword_v[2]] |= mask_v[2];
                    block[mword_v[3]] |= mask_v[3];
                }
            }
            { // high 128: same target word?
                mask_tmp = _mm256_permute2x128_si256(nwordA, nwordA, 1);
                __m256i m0h = _mm256_shuffle_epi32(mask_tmp, 0x0);
                mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0h));
                if (mask == ~0u)
                    block[mword_v[4]] |= u4;
                else
                {
                    block[mword_v[4]] |= mask_v[4];
                    block[mword_v[5]] |= mask_v[5];
                    block[mword_v[6]] |= mask_v[6];
                    block[mword_v[7]] |= mask_v[7];
                }
            }
        }
    } // for k
    for (; k < len; ++k)
    {
        unsigned n = idx[k];
        unsigned nbit = unsigned(n & bm::set_block_mask);
        unsigned nword = nbit >> bm::set_word_shift;
        nbit &= bm::set_word_mask;
        block[nword] |= (1u << nbit);
    } // for k
}
inline
__m256i avx2_setbit_256(__m256i target, __m256i source)
{
    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);

    __m256i v0, v1, acc1, acc2;
    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0)); // broadcast lane 0
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0); // out-of-range shift counts produce 0
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc1 = _mm256_or_si256(v1, v0);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc2 = _mm256_or_si256(v1, v0);

    target = _mm256_or_si256(target, acc1);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc1 = _mm256_or_si256(v1, v0);

    target = _mm256_or_si256(target, acc2);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc2 = _mm256_or_si256(v1, v0);

    target = _mm256_or_si256(target, acc1);
    target = _mm256_or_si256(target, acc2);
    return target;
}
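
/*
    Why the sllv scatter above works (explanatory sketch, not part of the
    original code): _mm256_sllv_epi32 yields 0 whenever the per-lane shift
    count is outside [0..31], so after subtracting the lane bases
    {0, 32, ..., 224} only the lane that owns the bit gets a non-zero mask.
    Scalar model for one lane i and bit number v:

        // mask[i] = (v >= 32*i && v < 32*i + 32) ? (1u << (v - 32*i)) : 0;
*/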
inline
void avx2_set_block_bits2(bm::word_t* BMRESTRICT block,
                          const unsigned* BMRESTRICT idx,
                          unsigned start, unsigned stop)
{
    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);
    __m256i* block_avx = (__m256i*)block;

    unsigned stride = 0;
    __m256i* avx_stride_p = block_avx + stride;
    __m256i blkA = _mm256_load_si256(avx_stride_p); // cache the current stride

    for (unsigned i = start; i < stop; ++i)
    {
        unsigned n = idx[i];
        unsigned nbit = unsigned(n & bm::set_block_mask);
        unsigned new_stride = nbit >> 8;   // (nbit / 256)
        unsigned stride_bit = nbit & 0xFF; // (nbit % 256)
        if (new_stride != stride) // flush the cached stride, switch
        {
            _mm256_store_si256(avx_stride_p, blkA);
            stride = new_stride;
            avx_stride_p = block_avx + stride;
            blkA = _mm256_load_si256(avx_stride_p);
        }
        __m256i v0 = _mm256_set1_epi32(stride_bit);
        __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
        __m256i k0 = _mm256_sllv_epi32(mask1, s0); // set the bit in its lane
        blkA = _mm256_or_si256(blkA, k0);
    } // for i
    _mm256_store_si256(avx_stride_p, blkA);
}
inline
void avx2_set_block_bits3(bm::word_t* BMRESTRICT block,
                          const unsigned* BMRESTRICT idx,
                          unsigned start, unsigned stop)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    idx += start;

    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);
    __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
    __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);

    BM_ALIGN32 unsigned mstride_v[8] BM_ALIGN32ATTR;
    BM_ALIGN32 unsigned mstride_bit_v[8] BM_ALIGN32ATTR;

    unsigned stride = 0;
    __m256i* block_avx = (__m256i*)block;
    __m256i* avx_stride_p = block_avx + stride;
    __m256i blkA = _mm256_load_si256(avx_stride_p);

    unsigned k = 0, mask;
    for (; k < len_unr; k += unroll_factor)
    {
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i nbitA = _mm256_and_si256(idxA, sb_mask); // nbit = idx & set_block_mask
        __m256i strideA = _mm256_srli_epi32(nbitA, 8);   // (nbit / 256)
        __m256i strideBitA = _mm256_and_si256(nbitA, stride_bit_mask); // (nbit % 256)

        // test if all 8 indexes hit the same AVX stride
        __m256i mask_tmp = _mm256_shuffle_epi32(strideA, 0x0);
        mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
        if (mask == ~0u) // same stride: one cached load/store
        {
            unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
            if (new_stride != stride)
            {
                _mm256_store_si256(avx_stride_p, blkA); // flush the stride
                stride = new_stride;
                avx_stride_p = block_avx + stride;
                blkA = _mm256_load_si256(avx_stride_p);
            }
            blkA = avx2_setbit_256(blkA, strideBitA);
        }
        else // strides differ: scalar scatter over the 8 indexes
        {
            _mm256_store_si256((__m256i*)mstride_bit_v, strideBitA);
            _mm256_store_si256((__m256i*)mstride_v, strideA);
            for (unsigned j = 0; j < 8; ++j)
            {
                unsigned new_stride = mstride_v[j];
                if (new_stride != stride)
                {
                    _mm256_store_si256(avx_stride_p, blkA);
                    stride = new_stride;
                    avx_stride_p = block_avx + stride;
                    blkA = _mm256_load_si256(avx_stride_p);
                }
                mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
                mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
                mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
                blkA = _mm256_or_si256(blkA, mask_tmp);
            } // for j
        }
    } // for k
    _mm256_store_si256(avx_stride_p, blkA); // flush the last stride

    for (; k < len; ++k)
    {
        unsigned n = idx[k];
        unsigned nbit = unsigned(n & bm::set_block_mask);
        unsigned nword = nbit >> bm::set_word_shift;
        nbit &= bm::set_word_mask;
        block[nword] |= (1u << nbit);
    } // for k
}
inline
__m256i avx2_setbit_to256(unsigned i)
{
    __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
    __m256i maskFF = _mm256_set1_epi32(-1);
    __m256i maskZ = _mm256_setzero_si256();

    __m256i v0 = _mm256_set1_epi32(i);
    __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
    __m256i k1 = _mm256_sllv_epi32(maskFF, s0); // partial mask in the owner lane

    // invert the owner lane's partial mask; lanes where sllv produced 0 stay 0
    __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
    cmp_eq = _mm256_xor_si256(maskFF, cmp_eq);
    k1 = _mm256_xor_si256(k1, cmp_eq);

    // lanes fully below the position become all-ones
    __m256i cmp_gt = _mm256_cmpgt_epi32(stride_idx2, v0);
    cmp_gt = _mm256_xor_si256(maskFF, cmp_gt);
    __m256i r = _mm256_xor_si256(k1, cmp_gt);
    return r;
}
inline
int avx2_cmpge_u32(__m256i vect8, unsigned value)
{
    // unsigned comparison: bias both sides by 0x80000000, compare signed
    __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
    __m256i mm_val = _mm256_set1_epi32(value);

    __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8);
    __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8);

    __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
    __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);

    __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
    int mask = _mm256_movemask_epi8(cmp_mask_ge);
    if (mask)
    {
        int bsf = bm::bsf_asm32(mask); // find the first lane with GE
        return bsf / 4;
    }
    return -1;
}
inline
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
{
    __m256i mZ = _mm256_setzero_si256();
    __m256i mVal = _mm256_set1_epi16(value);

    // subs_epu16(value, x) == 0  <=>  x >= value (unsigned saturation)
    __m256i mSub = _mm256_subs_epu16(mVal, vect16);
    __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
    unsigned mask = _mm256_movemask_epi8(mge_mask);
    if (mask)
    {
        int lz = _tzcnt_u32(mask);
        return lz / 2;
    }
    return -1;
}
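
/*
    The saturating-subtract trick above, modeled on scalars (sketch; the
    helper name is illustrative, not part of the library): AVX2 has no
    unsigned 16-bit compare, but _mm256_subs_epu16 clamps at zero, so a
    zero result means x >= value.
*/
inline bool ge_u16_via_subs_sketch(unsigned short x, unsigned short value)
{
    unsigned short sub = (unsigned short)((value > x) ? (value - x) : 0);
    return (sub == 0); // equivalent to (x >= value)
}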
inline
unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
                        unsigned pos, unsigned* BMRESTRICT is_set)
{
    unsigned res;
    const unsigned linear_cutoff = 48;
    const unsigned unroll_factor = 16;

    unsigned start = 1;
    unsigned end = 1 + ((*buf) >> 3);
    unsigned arr_end = end;

    if (end - start < unroll_factor) // too small for a SIMD stride
    {
        for (; start < end; ++start)
        {
            if (buf[start] >= pos)
            {
                res = ((*buf) & 1) ^ ((start-1) & 1);
                *is_set = res;
                return start;
            }
        } // for
    }
    while (start != end)
    {
        unsigned dsize = end - start;
        if (dsize < linear_cutoff) // SIMD linear scan from here
        {
            // widen the scan window to the array end (memory is available)
            dsize = arr_end - start;

            __m256i mZ = _mm256_setzero_si256();
            __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
            __m256i vect16, mSub, mge_mask;

            unsigned len_unr = start + (dsize - (dsize % unroll_factor));
            for (; start < len_unr; start += unroll_factor)
            {
                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); // 16 x u16
                mSub = _mm256_subs_epu16(mPos, vect16);
                mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
                int mask = _mm256_movemask_epi8(mge_mask);
                if (mask)
                {
                    int lz = _tzcnt_u32(mask) / 2;
                    start += lz;
                    res = ((*buf) & 1) ^ ((start-1) & 1);
                    *is_set = res;
                    return start;
                }
            } // for
            unsigned tail = unroll_factor - (end - start);
            if (start > tail + 1)
            {
                start -= tail; // rewind back and overlap-scan the tail
                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
                mSub = _mm256_subs_epu16(mPos, vect16);
                mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
                int mask = _mm256_movemask_epi8(mge_mask);
                int lz = _tzcnt_u32(mask) / 2;
                start += lz;
                res = ((*buf) & 1) ^ ((start-1) & 1);
                *is_set = res;
                return start;
            }
            for (; start < end; ++start) // residual scalar scan
            {
                if (buf[start] >= pos)
                    break;
            }
            break;
        }
        unsigned curr = (start + end) >> 1; // binary search step
        if (buf[curr] < pos)
            start = curr + 1;
        else
            end = curr;
    } // while

    res = ((*buf) & 1) ^ ((start-1) & 1);
    *is_set = res;
    return start;
}
inline
unsigned avx2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
                                   unsigned target,
                                   unsigned from,
                                   unsigned to)
{
    const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base

    unsigned unroll_factor = 8;
    unsigned len = to - from + 1;
    unsigned len_unr = len - (len % unroll_factor);

    __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
    __m256i vect_target = _mm256_set1_epi32(target);
    __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8); // signed bias

    int mask;
    __m256i vect80, norm_vect80, cmp_mask_ge;

    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k])); // 8 x u32
        norm_vect80 = _mm256_sub_epi32(vect80, mask0x8);

        cmp_mask_ge = _mm256_or_si256( // GT | EQ
            _mm256_cmpgt_epi32(norm_vect80, norm_target),
            _mm256_cmpeq_epi32(vect80, vect_target));
        mask = _mm256_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            int bsf = bm::bsf_asm32(mask); // find the first !=0
            return from + k + (bsf / 4);
        }
    } // for
    for (; k < len; ++k)
    {
        if (arr_base[k] >= target)
            return from + k;
    }
    return to + 1;
}
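
/*
    The 0x80000000 bias used above, on scalars (sketch; the helper name is
    illustrative, not part of the library): AVX2 only has signed 32-bit
    compares, and subtracting the bias maps unsigned order onto signed order.
*/
inline bool gt_u32_biased_sketch(unsigned a, unsigned b)
{
    return (int)(a - 0x80000000u) > (int)(b - 0x80000000u); // == (a > b)
}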
template<typename TRGW, typename IDX>
void avx2_bit_block_gather_scatter(TRGW* BMRESTRICT arr,
                                   const bm::word_t* BMRESTRICT blk,
                                   const IDX* BMRESTRICT idx,
                                   unsigned size, unsigned start,
                                   unsigned bit_idx)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
    __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
    __m256i maskFF = _mm256_set1_epi32(~0u);

    __m256i mask_tmp, mask_0;

    BM_ALIGN32 unsigned mword_v[8] BM_ALIGN32ATTR;

    unsigned k = 0, mask, w_idx;
    for (; k < len_unr; k += unroll_factor)
    {
        __m256i nbitA, nwordA;
        const unsigned base = start + k;
        __m256i* idx_ptr = (__m256i*)(idx + base); // idx[base:base+8]

        nbitA = _mm256_and_si256(_mm256_loadu_si256(idx_ptr), sb_mask);
        nwordA = _mm256_srli_epi32(nbitA, bm::set_word_shift);

        // check if all 8 source words are the same
        mask_tmp = _mm256_shuffle_epi32(nwordA, _MM_SHUFFLE(1,1,1,1));
        mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
        _mm256_store_si256((__m256i*)mword_v, nwordA);

        if (mask == ~0u) // all identical: one broadcast load
        {
            w_idx = mword_v[0];
            mask_tmp = _mm256_set1_epi32(blk[w_idx]);
        }
        else // gather the 8 source words
        {
            mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
                                        blk[mword_v[5]], blk[mword_v[4]],
                                        blk[mword_v[3]], blk[mword_v[2]],
                                        blk[mword_v[1]], blk[mword_v[0]]);
        }

        // test the source bits
        __m256i shiftA = _mm256_and_si256(nbitA, sw_mask);
        __m256i mask1 = _mm256_srli_epi32(maskFF, 31); // set1(1)
        mask_0 = _mm256_sllv_epi32(mask1, shiftA);

        mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
        if (!_mm256_testz_si256(mask_tmp, mask_tmp)) // some bits are set
        {
            __m256i* target_ptr = (__m256i*)(arr + base); // arr[base:base+8]
            // set bit_idx in the target elements where the source bit is set
            __m256i maskZ = _mm256_xor_si256(maskFF, maskFF); // zero
            mask1 = _mm256_slli_epi32(mask1, bit_idx);
            mask_tmp = _mm256_cmpeq_epi32(mask_tmp, maskZ); // 0xFF.. where clear
            mask_tmp = _mm256_xor_si256(mask_tmp, maskFF);  // invert
            mask_tmp = _mm256_and_si256(mask_tmp, mask1);
            _mm256_storeu_si256(target_ptr,
                        _mm256_or_si256(mask_tmp,
                                        _mm256_loadu_si256(target_ptr)));
        }
    } // for k
    for (; k < len; ++k)
    {
        const unsigned base = start + k;
        unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
        arr[base] |= (TRGW(bool(blk[nbit >> bm::set_word_shift] &
                                (1u << (nbit & bm::set_word_mask)))) << bit_idx);
    } // for k
}
    unsigned bitval = (*block) & 1u;

    unsigned bit_idx = 0;

    const unsigned vCAP = 64; // bit capacity of one scan word
    __m256i maskZ = _mm256_set1_epi32(0);

    for (; block < block_end; block += 8)
    {
        // quick SIMD check for the first non-zero byte in the 256-bit stride
        __m256i accA = _mm256_load_si256((__m256i*)block);
        __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
        unsigned mask = ~_mm256_movemask_epi8(cmpA);

        unsigned w64_idx = _tzcnt_u32(mask);
        bit_idx += k * vCAP; // k = first non-zero 64-bit word (w64_idx / 8)

        if (!val || val == ~0ull) // saturated all-0 / all-1 word
        {
            // branchless run-flush: mask is all-ones only if the polarity flips
            bool cmp = (bool(bitval) != bool(val));
            unsigned mask = ~(cmp - 1u);

            bitval ^= unsigned(cmp);
            unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
            pcu += mask & sizeof(gap_word_t); // conditional pointer advance
            pcurr = reinterpret_cast<gap_word_t*>(pcu);
        }

        unsigned bits_consumed = 0;

        if (bitval != (val & tz)) // polarity flip: close the current run
        {
            BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
        }
        tz = (unsigned)_tzcnt_u64(bitval ? ~val : val); // run length in this word

        bool cmp = ((bits_consumed += tz) < vCAP);

        bitval ^= unsigned(cmp);
        bit_idx += tz & (vCAP - bits_consumed);
        unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);

        BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
    } // for block

    // close the GAP buffer: encode the length into the header word
    unsigned len = (unsigned)(pcurr - dest);
    *dest = (gap_word_t)((*dest & 7) + (len << 3));
inline
void avx2_bit_block_xor(bm::word_t* target_block,
                        const bm::word_t* block,
                        const bm::word_t* xor_block,
                        bm::id64_t digest)
{
    for (unsigned i = 0; i < bm::block_waves; ++i)
    {
        const bm::id64_t mask = (1ull << i);
        unsigned off = (i * bm::set_block_digest_wave_size);
        const __m256i* sub_block = (__m256i*)(block + off);
        __m256i* t_sub_block = (__m256i*)(target_block + off);

        if (digest & mask) // XOR-filter this sub-block
        {
            const __m256i* xor_sub_block = (__m256i*)(xor_block + off);
            __m256i mA, mB, mC, mD;
            mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                                  _mm256_load_si256(xor_sub_block));
            mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                                  _mm256_load_si256(xor_sub_block+1));
            mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                                  _mm256_load_si256(xor_sub_block+2));
            mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                                  _mm256_load_si256(xor_sub_block+3));

            _mm256_store_si256(t_sub_block,   mA);
            _mm256_store_si256(t_sub_block+1, mB);
            _mm256_store_si256(t_sub_block+2, mC);
            _mm256_store_si256(t_sub_block+3, mD);
        }
        else // just copy the source sub-block
        {
            _mm256_store_si256(t_sub_block,   _mm256_load_si256(sub_block));
            _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
            _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
            _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
        }
    } // for i
}
// (from avx2_bit_block_xor_2way)
unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);  // index of the lowest set digest bit
// ...
const __m256i* sub_block = (const __m256i*) (xor_block + off);
__m256i* t_sub_block = (__m256i*)(target_block + off);

__m256i mA, mB, mC, mD;
mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                      _mm256_load_si256(t_sub_block));
mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                      _mm256_load_si256(t_sub_block+1));
mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                      _mm256_load_si256(t_sub_block+2));
mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                      _mm256_load_si256(t_sub_block+3));

_mm256_store_si256(t_sub_block, mA);
_mm256_store_si256(t_sub_block+1, mB);
_mm256_store_si256(t_sub_block+2, mC);
_mm256_store_si256(t_sub_block+3, mD);
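For comparison, one digest wave of the 2-way kernel in scalar form; the wave width of 32 words matches the four 256-bit load/store pairs above, but is stated here as an assumption:

// Scalar equivalent of one wave of avx2_bit_block_xor_2way (a sketch):
inline void xor_wave_scalar(unsigned* target_block, const unsigned* xor_block,
                            unsigned wave)
{
    const unsigned wave_size = 32;      // words per digest wave (assumed)
    unsigned off = wave * wave_size;
    for (unsigned i = 0; i < wave_size; ++i)
        target_block[off + i] ^= xor_block[off + i]; // fold the XOR product in place
}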
#pragma GCC diagnostic pop
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    avx2_bit_count((__m256i*) (first), (__m256i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_OR(first, last, mask) \
    avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_XOR(first, last, mask) \
    avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_SUB(first, last, mask) \
    avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
#define VECT_INVERT_BLOCK(first) \
    avx2_invert_block((__m256i*)first)
#define VECT_AND_BLOCK(dst, src) \
    avx2_and_block((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    avx2_and_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

#define VECT_OR_BLOCK(dst, src) \
    avx2_or_block((__m256i*) dst, (__m256i*) (src))
#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    avx2_sub_block((__m256i*) dst, (__m256i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    avx2_xor_block((__m256i*) dst, (__m256i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    avx2_copy_block((__m256i*) dst, (__m256i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    avx2_stream_block((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    avx2_set_block((__m256i*) dst, (value))

#define VECT_IS_ZERO_BLOCK(dst) \
    avx2_is_all_zero((__m256i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    avx2_is_all_one((__m256i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    avx2_is_digest_zero((__m256i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    avx2_block_set_digest((__m256i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    avx2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    avx2_shift_l1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    avx2_shift_r1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    avx2_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    avx2_set_block_bits3(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    avx2_bit_block_calc_change((__m256i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)

#define VECT_BIT_TO_GAP(dest, src, dest_len) \
    avx2_bit_to_gap(dest, src, dest_len)

#define VECT_BIT_FIND_FIRST(src1, pos) \
    avx2_bit_find_first((__m256i*) src1, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    avx2_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    avx2_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    avx2_gap_bfind(buf, pos, is_set)

#define VECT_BIT_COUNT_DIGEST(blk, d) \
    avx2_bit_block_count(blk, d)
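The VECT_* macros above form BitMagic's SIMD dispatch layer: portable core code invokes the macro, and including this header binds it to the AVX2 kernels. Below is a sketch of a call site; the wrapper function, the scalar fallback, and the 2048-word block size are illustrative assumptions, not library code.

// Hypothetical call site for the dispatch layer:
void xor_bit_blocks(unsigned* dst, const unsigned* src)
{
#ifdef VECT_XOR_BLOCK
    VECT_XOR_BLOCK(dst, src);            // resolves to avx2_xor_block() here
#else
    for (unsigned i = 0; i < 2048; ++i)  // portable scalar fallback
        dst[i] ^= src[i];
#endif
}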
Macro index:

#define BM_AVX2_POPCNT_PROLOG
#define BM_CSA256(h, l, a, b, c)
#define BM_AVX2_BIT_COUNT(ret, v)

Function index: Bit manipulation primitives (internal)
unsigned avx2_xor_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
XOR block against another dst ^= *src.
bool avx2_sub_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
SUB (AND NOT) block digest stride dst &= ~*src.
bool avx2_and_digest_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
AND block digest stride.
bool avx2_or_block_5way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2, const __m256i *BMRESTRICT src3, const __m256i *BMRESTRICT src4)
OR array elements against 4 other arrays dst |= *src1 | *src2 | *src3 | *src4.
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
void avx2_bit_block_calc_change_bc(const __m256i *BMRESTRICT block, unsigned *gcount, unsigned *bcount)
bm::id_t avx2_bit_count_sub(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND NOT bit count for two aligned bit-blocks.
bool avx2_or_block_3way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR array elements against 2 other arrays dst |= *src1 | *src2.
bm::id_t avx2_bit_count_xor(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
XOR bit count for two aligned bit-blocks.
bool avx2_and_digest(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND block digest stride dst &= *src.
BMFORCEINLINE bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
void avx2_copy_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy dst = *src.
BMFORCEINLINE bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFF..FF (all ones)
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
bool avx2_is_digest_zero(const __m256i *BMRESTRICT block)
check if digest stride is all zero bits
bool avx2_or_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
OR array elements against another unaligned array dst |= *src.
void avx2_bit_block_calc_xor_change(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT xor_block, unsigned size, unsigned *BMRESTRICT gcount, unsigned *BMRESTRICT bcount)
bool avx2_and_or_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
bool avx2_bit_find_first_diff(const __m256i *BMRESTRICT block1, const __m256i *BMRESTRICT block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
unsigned avx2_bit_to_gap(gap_word_t *BMRESTRICT dest, const unsigned *BMRESTRICT block, unsigned dest_len)
Convert bit block to GAP block.
BMFORCEINLINE void avx2_set_block(__m256i *BMRESTRICT dst, bm::word_t value)
AVX2 block memset dst = value.
bool avx2_shift_r1_and(__m256i *BMRESTRICT block, bm::word_t co1, const __m256i *BMRESTRICT mask_block, bm::id64_t *BMRESTRICT digest)
fused block shift right by 1 plus AND
void avx2_copy_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy (unaligned SRC) dst = *src.
bool avx2_or_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
OR array elements against another array dst |= *src.
void avx2_stream_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy with non-temporal (streaming) stores dst = *src.
unsigned avx2_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
bm::id_t avx2_bit_count_and(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
AND bit count for two aligned bit-blocks.
void avx2_andnot_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
Invert array elements and AND them with the specified mask dst = ~*src & mask.
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
unsigned avx2_sub_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
bool avx2_is_all_zero(const __m256i *BMRESTRICT block)
check if block is all zero bits
BMFORCEINLINE bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
bool avx2_and_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
AND block digest stride 2 way dst = *src1 & *src2.
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
void avx2_stream_block_unalign(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AVX2 block copy (unaligned SRC) with non-temporal (streaming) stores dst = *src.
BMFORCEINLINE bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all the same (NULL or FULL)
unsigned avx2_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to scan.
unsigned avx2_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
bool avx2_bit_find_first(const __m256i *BMRESTRICT block, unsigned *pos)
Find first bit set.
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
bool avx2_is_all_one(const __m256i *BMRESTRICT block)
check if block is all one bits
unsigned avx2_bit_block_calc_change(const __m256i *BMRESTRICT block, unsigned size)
unsigned avx2_and_arr_unal(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end)
AND array elements against another array (unaligned) dst &= *src.
void avx2_invert_block(__m256i *BMRESTRICT dst)
Invert bit-block dst = ~*dst (XOR against an all-ones mask).
void avx2_xor_arr_2_mask(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src, const __m256i *BMRESTRICT src_end, bm::word_t mask)
XOR array elements to specified mask dst = *src ^ mask.
unsigned avx2_xor_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
3-operand XOR dst = *src1 ^ *src2
bool avx2_sub_digest_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
unsigned avx2_and_block(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src)
AND array elements against another array dst &= *src.
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in sorted, growing array.
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.
bm::id_t avx2_bit_count(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 Instructions" by Muła, Kurz and Lemire.
bool avx2_or_block_2way(__m256i *BMRESTRICT dst, const __m256i *BMRESTRICT src1, const __m256i *BMRESTRICT src2)
OR 2 arrays and copy to the destination dst = *src1 | *src2.
const unsigned set_block_digest_wave_size
void avx2_set_block_bits3(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
__m256i avx2_setbit_to256(unsigned i)
Experimental.
const unsigned set_block_mask
bm::id_t avx2_bit_count_or(const __m256i *BMRESTRICT block, const __m256i *BMRESTRICT block_end, const __m256i *BMRESTRICT mask_block)
void avx2_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
const unsigned set_word_shift
const unsigned set_block_size
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
unsigned long long int id64_t
const unsigned block_waves
BMFORCEINLINE __m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX2 target, by indexes (int32 lanes) from the source.
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
void avx2_set_block_bits2(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
unsigned short gap_word_t
const unsigned gap_max_bits
const unsigned set_block_shift
void avx2_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
const unsigned set_word_mask
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)
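A closing usage sketch tying two of the listed primitives together. The wrapper name is hypothetical, the 2048-word block size is an assumption, and both blocks must be 32-byte aligned because the kernels use aligned loads:

#include <immintrin.h>

// Population count of (a AND b) over one bit-block (a sketch):
inline unsigned popcount_and_block(const unsigned* a, const unsigned* b)
{
    const __m256i* first = (const __m256i*)a;
    const __m256i* last  = first + (2048 / 8);  // 8x 32-bit words per __m256i
    return bm::avx2_bit_count_and(first, last, (const __m256i*)b);
}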