#ifndef BMSSE4__H__INCLUDED__
#define BMSSE4__H__INCLUDED__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

#pragma warning( push )
#pragma warning( disable : 4146)

# define _mm_popcnt_u32 __builtin_popcount
# define _mm_popcnt_u64 __builtin_popcountll
# define BM_BSF32 __builtin_ctz

# define BM_BSF32 bm::bsf_asm32
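// On GCC builds the MSVC-style POPCNT intrinsics map to compiler builtins;
// BM_BSF32 is bit-scan-forward (count of trailing zeros), defined either as
// the builtin or as an asm-based fallback, depending on the compiler
// detection preprocessor branch.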
        count += unsigned( _mm_popcnt_u64(b[0]) +
                           _mm_popcnt_u64(b[1]) +
                           _mm_popcnt_u64(b[2]) +
                           _mm_popcnt_u64(b[3]));
        const unsigned* b = (unsigned*) block;
        count += _mm_popcnt_u32(b[0]) +
                 _mm_popcnt_u32(b[1]) +
                 _mm_popcnt_u32(b[2]) +
                 _mm_popcnt_u32(b[3]);
    } while (++block < block_end);
    unsigned ret = (a ^ b);
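// sse4_bit_count_op applies a 128-bit logical functor (sse2_and, sse2_or,
// sse2_xor, sse2_sub) to pairs of vectors from block and mask_block, then
// popcounts the result. The 64-bit path below spills each vector into the
// aligned temporary tcnt[] so it can use two _mm_popcnt_u64 calls; the
// 32-bit path extracts four dwords and popcounts them individually.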
        __m128i b = sse2_func(_mm_load_si128(block), _mm_load_si128(mask_block));
        _mm_store_si128((__m128i*)tcnt, b);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        b = sse2_func(_mm_load_si128(block+1), _mm_load_si128(mask_block+1));
        _mm_store_si128((__m128i*)tcnt, b);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        block += 2; mask_block += 2;
    } while (block < block_end);
        __m128i tmp0 = _mm_load_si128(block);
        __m128i tmp1 = _mm_load_si128(mask_block);
        __m128i b = sse2_func(tmp0, tmp1);

        count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
        count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));

        ++block; ++mask_block;
    } while (block < block_end);
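// The zero/one tests below avoid full popcounts: vectors are OR-ed (or
// AND-ed) pairwise and checked with PTEST-style predicates
// (_mm_testz_si128 / _mm_test_all_ones). The *_digest variants operate on
// one digest stride (8 x 128 bit = 1024 bits): two half-stride flags z1/z2
// are combined into the return value, so the caller can clear the
// corresponding digest bit when a stride turns all-zero.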
    __m128i maskz = _mm_setzero_si128();

    do
    {
        w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != 0
            return false;
        w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz)))
            return false;
        block += 4;
    } while (block < block_end);
    return true;
    __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
    __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
    wA = _mm_or_si128(wA, wB);
    bool z1 = _mm_test_all_zeros(wA, wA);

    wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
    wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
    wA = _mm_or_si128(wA, wB);
    bool z2 = _mm_test_all_zeros(wA, wA);

    return z1 & z2;
    __m128i mV = _mm_set1_epi32(int(value));
    _mm_store_si128(dst, mV);     _mm_store_si128(dst + 1, mV);
    _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
    _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
    _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
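// sse4_and_block: AND two full blocks in 4-vector strides; accA..accD
// OR-accumulate every partial result so a single PTEST at the end can
// report whether any bits survived the AND.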
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    return !_mm_testz_si128(accA, accA);
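// The digest AND family: sse4_and_digest (dst &= *src), the 2-way variant
// (dst = *src1 & *src2), the AND-OR variant and the 5-way variant all
// follow the same shape -- compute four vectors, store, OR-fold them,
// PTEST for zero, then repeat for the second half of the stride and
// return z1 & z2.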
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
    __m128i m1A, m1B, m1C, m1D;
    __m128i mACC1;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z1 = _mm_testz_si128(mACC1, mACC1);

    m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
    bool z2 = _mm_testz_si128(mACC1, mACC1);

    m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
    m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
    m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
    m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    return z1 & z2;
    __m128i m1A, m1B, m1C, m1D;
    __m128i m1E, m1F, m1G, m1H;

    m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
    m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
    m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
    m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));

    m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
    m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
    m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
    m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
    m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
    m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
    m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));

    m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
    m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
    m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
    m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));

    m1A = _mm_and_si128(m1A, m1E);
    m1B = _mm_and_si128(m1B, m1F);
    m1C = _mm_and_si128(m1C, m1G);
    m1D = _mm_and_si128(m1D, m1H);

    m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
    m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
    m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
    m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
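// sse4_sub_digest / sse4_sub_digest_2way implement logical SUB as AND-NOT;
// note the operand order of _mm_andnot_si128: it is the *first* argument
// that gets complemented.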
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
    __m128i m1A, m1B, m1C, m1D;

    m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));

    _mm_store_si128(dst+0, m1A);
    _mm_store_si128(dst+1, m1B);
    _mm_store_si128(dst+2, m1C);
    _mm_store_si128(dst+3, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z1 = _mm_testz_si128(m1A, m1A);

    m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
    m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
    m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
    m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));

    _mm_store_si128(dst+4, m1A);
    _mm_store_si128(dst+5, m1B);
    _mm_store_si128(dst+6, m1C);
    _mm_store_si128(dst+7, m1D);

    m1A = _mm_or_si128(m1A, m1B);
    m1C = _mm_or_si128(m1C, m1D);
    m1A = _mm_or_si128(m1A, m1C);

    bool z2 = _mm_testz_si128(m1A, m1A);

    return z1 & z2;
    do
    {
        w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
        if (!_mm_test_all_ones(w))
            return false;
        w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
        if (!_mm_test_all_ones(w))
            return false;
        block += 4;
    } while (block < block_end);
    return true;
    return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
    __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
    return _mm_testz_si128(w0, w0);

    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_or_si128(w0, w1);
    return _mm_testz_si128(w0, w0);
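// "Wave" pointer tests: one wave is a 128-bit load worth of pointers.
// Equality of two waves (below) reduces to XOR + PTEST-for-zero, which is
// how the library quickly detects identical pointer pairs (e.g. both NULL
// or both FULL block addresses).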
    __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
    __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
    w0 = _mm_xor_si128(w0, w1);
    return _mm_testz_si128(w0, w0);
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + size);
    __m128i m1COshft, m2COshft;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
        m2A = _mm_xor_si128(m2A, m2As);

        _mm_store_si128((__m128i*)tcnt, m1A);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
        _mm_store_si128((__m128i*)tcnt, m2A);
        count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));

        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                          _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
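// Bit-change (GAP length) counting above and below: the number of 0-1
// transitions in a word sequence is popcount(w ^ (w << 1)), carried across
// word, lane and vector boundaries. The 31-bit right shifts extract the
// per-lane carry bits, _mm_slli_si128 + _mm_insert_epi32 move them one lane
// up, and co1/co2 chain the carry from vector to vector.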
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + size);
    __m128i m1COshft, m2COshft;

    unsigned gap_count = 1;
    unsigned bit_count = 0;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2, xor_block += 2)
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);
        __m128i m1B = _mm_load_si128(xor_block);
        __m128i m2B = _mm_load_si128(xor_block+1);

        m1A = _mm_xor_si128(m1A, m1B);
        m2A = _mm_xor_si128(m2A, m2B);

        _mm_store_si128 ((__m128i*)simd_buf0, m1A);
        _mm_store_si128 ((__m128i*)simd_buf1, m2A);
        bit_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
        bit_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));

        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                              _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                              _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
        m2A = _mm_xor_si128(m2A, m2As);

        _mm_store_si128 ((__m128i*)simd_buf0, m1A);
        _mm_store_si128 ((__m128i*)simd_buf1, m2A);
        gap_count += unsigned(_mm_popcnt_u64(simd_buf0[0]) + _mm_popcnt_u64(simd_buf0[1]));
        gap_count += unsigned(_mm_popcnt_u64(simd_buf1[0]) + _mm_popcnt_u64(simd_buf1[1]));

        bm::id_t m0 = _mm_extract_epi32(m1A, 0);
        bm::id_t m1 = _mm_extract_epi32(m1A, 1);
        bm::id_t m2 = _mm_extract_epi32(m1A, 2);
        bm::id_t m3 = _mm_extract_epi32(m1A, 3);
        gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                              _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

        m0 = _mm_extract_epi32(m2A, 0);
        m1 = _mm_extract_epi32(m2A, 1);
        m2 = _mm_extract_epi32(m2A, 2);
        m3 = _mm_extract_epi32(m2A, 3);
        gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
                              _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));

    gap_count -= (w0 & 1u); // correct the carry-in of the very first word
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i m1COshft, m2COshft;

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
        bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        __m128i m1As = _mm_slli_epi32(m1A, 1);
        __m128i m2As = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1As = _mm_or_si128(m1As, m1COshft);
        m2As = _mm_or_si128(m2As, m2COshft);

        co1 = co2;

        m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
        m2A = _mm_xor_si128(m2A, m2As);

        m0 = _mm_extract_epi64(m1A, 0);
        m1 = _mm_extract_epi64(m1A, 1);
        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
        m0 = _mm_extract_epi64(m2A, 0);
        m1 = _mm_extract_epi64(m2A, 1);
        gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
    } // for

    gap_count -= (w0 & 1u); // correct the carry-in of the very first word
    const __m128i* block1_end =
        (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
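// Lane localization trick used by the find-first loops below:
// _mm_cmpeq_epi32 against zero + _mm_movemask_epi8 yields 0xF nibbles for
// zero dwords; inverting the mask and taking BSF gives the byte index of
// the first non-zero dword (>> 2 converts bytes to a word index), and a
// second BSF inside that word finds the exact bit position.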
    do
    {
        mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
        mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR)) // (mA | mB) != 0
        {
            if (!_mm_test_all_zeros(mA, mA)) // the difference is in mA
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask; // invert to find (w != 0)
                int bsf = BM_BSF32(mask); // find the first non-zero byte
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2; // (bsf / 4) : non-zero word index
                unsigned w = simd_buf[widx];
                bsf = BM_BSF32(w); // find the first bit != 0
                *pos = (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            mask = ~mask;
            int bsf = BM_BSF32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = BM_BSF32(w);
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block1 += 2; block2 += 2;
    } while (block1 < block1_end);
    return false;
    const __m128i* block_end =
        (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
    const __m128i maskZ = _mm_setzero_si128();
    __m128i mA, mB;
    unsigned simd_lane = 0;
    do
    {
        mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
        __m128i mOR = _mm_or_si128(mA, mB);
        if (!_mm_test_all_zeros(mOR, mOR)) // (mA | mB) != 0
        {
            if (!_mm_test_all_zeros(mA, mA)) // the bit is in mA
            {
                unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
                mask = ~mask;
                int bsf = BM_BSF32(mask);
                _mm_store_si128 ((__m128i*)simd_buf, mA);
                unsigned widx = bsf >> 2;
                unsigned w = simd_buf[widx];
                bsf = BM_BSF32(w);
                *pos = (simd_lane * 128) + (widx * 32) + bsf;
                return true;
            }
            unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
            mask = ~mask;
            int bsf = BM_BSF32(mask);
            _mm_store_si128 ((__m128i*)simd_buf, mB);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = BM_BSF32(w);
            *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
            return true;
        }
        simd_lane += 2;
        block += 2;
    } while (block < block_end);
    return false;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
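// sse4_gap_find: branch-light scan of up to 8 sorted 16-bit GAP values.
// SSE lacks an unsigned 16-bit >= compare, so it is synthesized with
// saturating subtraction below; when fewer than 8 values remain, the
// register tail is padded with 0xFFFF so padded slots always satisfy
// the >= pos test.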
    const unsigned unroll_factor = 8;

    for (j = 0; j < size; ++j)

    __m128i m1, mz, maskF, maskFL;

    mz = _mm_setzero_si128();
    m1 = _mm_loadu_si128((__m128i*)(pbuf));

    maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
    maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
    int shiftL = (64 - (unroll_factor - size) * 16);
    maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]

    m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
    m1 = _mm_or_si128(m1, maskFL);     // pad the tail with 0xFFFF

    __m128i mp = _mm_set1_epi16(pos);  // broadcast pos into all elements
    __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= pos
    __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // isolate the sign bit
    int mi = _mm_movemask_epi8(c_mask);
    if (unsigned bc = _mm_popcnt_u32(mi)) // number of elements >= pos
        return unroll_factor - bc;        // index of the first one

    m1 = _mm_loadu_si128((__m128i*)(pbuf2));
    mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
    mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
    unsigned bc = _mm_popcnt_u32(mi);
    unsigned end = 1 + ((*buf) >> 3);
    unsigned dsize = end - start;

    *is_set = ((*buf) & 1) ^ (start & 1);

    BM_ASSERT(buf[start] < pos || (start == 0));

    unsigned arr_end = end;
    while (start != end)
    {
        unsigned curr = (start + end) >> 1;
        if (buf[curr] < pos)
            start = curr + 1;
        else
            end = curr;

        unsigned size = end - start;

        size += (end != arr_end);

    BM_ASSERT(buf[start - 1] < pos || (start == 1));

    *is_set = ((*buf) & 1) ^ ((start-1) & 1);
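// sse42_gap_bfind narrows the range by binary search and hands small
// ranges to bm::sse4_gap_find above. The final is_set derivation relies on
// GAP polarity: the low bit of the header word *buf is the value of the
// very first run, and consecutive runs alternate, hence the XOR with the
// parity of the found run index.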
    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i mm_val = _mm_set1_epi32(value);

    __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8);
    __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8);

    __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
    __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);

    __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
    int mask = _mm_movemask_epi8(cmp_mask_ge);
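// SSE has no unsigned 32-bit compare, so both sides are re-biased by
// 0x80000000, which maps unsigned order onto signed order; GT | EQ then
// gives a "greater or equal" byte mask. Illustrative caller sketch
// (hypothetical names p/value/idx, not from this header):
//
//   int m = bm::sse42_cmpge_u32(_mm_loadu_si128((__m128i*)p), value);
//   if (m)
//       idx = unsigned(BM_BSF32(m)) / 4; // 4 mask bytes per 32-bit lane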
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m128i nbM = _mm_set1_epi32(nb);

    unsigned k;
    for (k = 0; k < len_unr; k += unroll_factor)
    {
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
        __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift);
        __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);

        if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
            !_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbB)))
            break;
    } // for k
    for (; k < len; ++k)
    const unsigned unroll_factor = 4;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
    __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
    BM_ALIGN16 unsigned mshift_v[4] BM_ALIGN16ATTR;
    BM_ALIGN16 unsigned mword_v[4] BM_ALIGN16ATTR;

    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
        __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
        __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift

        nbitA = _mm_and_si128 (nbitA, sw_mask);
        _mm_store_si128 ((__m128i*)mshift_v, nbitA);

        // check if all 4 bits land in the same 32-bit word
        __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0);
        __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA);
        if (_mm_test_all_ones(cmpA)) // merge into one read-modify-write
        {
            unsigned nword = _mm_extract_epi32(nwordA, 0);
            block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
                            | (1u << mshift_v[2]) | (1u << mshift_v[3]);
        }
        else // bits are in different words : scatter them one by one
        {
            _mm_store_si128 ((__m128i*)mword_v, nwordA);
            block[mword_v[0]] |= (1u << mshift_v[0]);
            block[mword_v[1]] |= (1u << mshift_v[1]);
            block[mword_v[2]] |= (1u << mshift_v[2]);
            block[mword_v[3]] |= (1u << mshift_v[3]);
        }
    } // for k

    for (; k < len; ++k)
    {
        unsigned n = idx[k];
        unsigned nbit = unsigned(n & bm::set_block_mask);
        unsigned nword = nbit >> bm::set_word_shift;
        nbit &= bm::set_word_mask;
        block[nword] |= (1u << nbit);
    } // for k
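// Note: sse42_set_block_bits assumes idx[start..stop) all address the same
// bit-block (the span established by sse42_idx_arr_block_lookup above);
// the same-word merge path pays off when indices arrive sorted and
// clustered.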
    const unsigned unroll_factor = 4;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m128i maskFF = _mm_set1_epi32(~0u);
    __m128i maskZ = _mm_xor_si128(maskFF, maskFF);

    __m128i mask_tmp, mask_0;
    __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
    __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
    BM_ALIGN16 unsigned mshift_v[4] BM_ALIGN16ATTR;
    BM_ALIGN16 unsigned mword_v[4] BM_ALIGN16ATTR;

    unsigned k = 0;
    unsigned base = start + k;
    __m128i* idx_ptr = (__m128i*)(idx + base);
    __m128i* target_ptr = (__m128i*)(arr + base);
    for (; k < len_unr; k += unroll_factor)
    {
        __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask);
        __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift);

        _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
        _mm_store_si128 ((__m128i*)mword_v, nwordA);

        // build a (1 << mshift) mask per lane; the direct _mm_set_epi32
        // below computes the same mask
        __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
        __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
        mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
        mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
        mask_0 = _mm_or_si128 (mask_0, mask_tmp);

        __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
                                        _mm_slli_si128 (am_0, 8));
        mask_tmp = _mm_and_si128 (
            _mm_slli_epi32(mask1, mshift_v[3]),
            _mm_slli_si128 (am_0, 12));

        mask_0 = _mm_or_si128 (mask_0,
                               _mm_or_si128 (mask_2, mask_tmp));

        mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);

        mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
                                               blk[mword_v[1]], blk[mword_v[0]]),
                                 mask_0);

        mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // 0xFF.. where == 0
        mask_tmp = _mm_xor_si128 (mask_tmp, maskFF);  // invert
        mask_tmp = _mm_srli_epi32 (mask_tmp, 31);     // (m != 0) -> 1

        mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx

        _mm_storeu_si128 (target_ptr,
                          _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));

        ++idx_ptr; ++target_ptr;
        _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
    } // for k

    for (; k < len; ++k)
    __m128i* block_end =
        (__m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i mAcc = _mm_set1_epi32(0);
    __m128i mMask1 = _mm_set1_epi32(1);
    unsigned co2;

    for (--block_end; block_end >= block; block_end -= 2)
    {
        __m128i m1A = _mm_load_si128(block_end);
        __m128i m2A = _mm_load_si128(block_end-1);

        __m128i m1CO = _mm_and_si128(m1A, mMask1);
        __m128i m2CO = _mm_and_si128(m2A, mMask1);

        co2 = _mm_extract_epi32(m1CO, 0);

        m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
        m2A = _mm_srli_epi32(m2A, 1);

        __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
        __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
        m1COshft = _mm_slli_epi32(m1COshft, 31);
        m2COshft = _mm_slli_epi32(m2COshft, 31);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 0);

        _mm_store_si128(block_end, m1A);
        _mm_store_si128(block_end-1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    } // for

    *empty_acc = !_mm_testz_si128(mAcc, mAcc);
    return co1;
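// sse42_shift_l1 walks the block backwards: every word shifts right by one
// bit, the evicted bit 0 of each word is captured (AND with 1), moved one
// lane down with _mm_srli_si128, and re-injected at bit 31 of the neighbor;
// co1 chains the carry across vector pairs and is returned for the next
// block. sse42_shift_r1 below mirrors the same scheme in the opposite
// direction, and mAcc gives the caller a cheap emptiness check.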
    __m128i* block_end =
        (__m128i*)((bm::word_t*)(block) + bm::set_block_size);
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);
    unsigned co2;

    for (; block < block_end; block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
        m2A = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
        m2COshft = _mm_slli_si128 (m2CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
        m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = _mm_extract_epi32(m2CO, 3);

        _mm_store_si128(block, m1A);
        _mm_store_si128(block+1, m2A);

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);
    } // for

    *empty_acc = !_mm_testz_si128(mAcc, mAcc);
    return co1;
    __m128i m1COshft, m2COshft;
    __m128i mAcc = _mm_set1_epi32(0);

    di = unsigned(_mm_popcnt_u64(t - 1)); // start bit-index of the digest stride

    di += unsigned(_mm_popcnt_u32(t32 - 1));

    for (; di < 64; ++di)

    block = (__m128i*) &wblock[d_base];
    mask_block = (__m128i*) &mblock[d_base];
    mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
    for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
    {
        __m128i m1A = _mm_load_si128(block);
        __m128i m2A = _mm_load_si128(block+1);

        __m128i m1CO = _mm_srli_epi32(m1A, 31);
        __m128i m2CO = _mm_srli_epi32(m2A, 31);

        co2 = _mm_extract_epi32(m1CO, 3);

        m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
        m2A = _mm_slli_epi32(m2A, 1);

        m1COshft = _mm_slli_si128 (m1CO, 4);
        m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);

        co1 = co2;
        co2 = _mm_extract_epi32(m2CO, 3);

        m2COshft = _mm_slli_si128 (m2CO, 4);
        m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);

        m1A = _mm_or_si128(m1A, m1COshft);
        m2A = _mm_or_si128(m2A, m2COshft);

        co1 = co2;

        m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block));
        m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1));

        mAcc = _mm_or_si128(mAcc, m1A);
        mAcc = _mm_or_si128(mAcc, m2A);

        _mm_store_si128(block, m1A);
        _mm_store_si128(block+1, m2A);
    } // for i

    if (_mm_testz_si128(mAcc, mAcc))

    bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
    d |= (dmask & (w0 << di));
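// sse42_shift_r1_and is the digest-driven variant of the shift: only
// strides whose bit is set in the digest word d are shifted and AND-ed
// with the mask block, with mAcc OR-accumulating the stride result; when
// a stride comes out empty, its bit is dropped from d, and the carry-out
// word w0 can re-seed the following stride.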
        const __m128i* sub_block = (__m128i*) (block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

            const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
            __m128i mA, mB, mC, mD;
            mA = _mm_xor_si128(_mm_load_si128(sub_block),
                               _mm_load_si128(xor_sub_block));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                               _mm_load_si128(xor_sub_block+1));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                               _mm_load_si128(xor_sub_block+2));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                               _mm_load_si128(xor_sub_block+3));

            _mm_store_si128(t_sub_block, mA);
            _mm_store_si128(t_sub_block+1, mB);
            _mm_store_si128(t_sub_block+2, mC);
            _mm_store_si128(t_sub_block+3, mD);

            mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                               _mm_load_si128(xor_sub_block+4));
            mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                               _mm_load_si128(xor_sub_block+5));
            mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                               _mm_load_si128(xor_sub_block+6));
            mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                               _mm_load_si128(xor_sub_block+7));

            _mm_store_si128(t_sub_block+4, mA);
            _mm_store_si128(t_sub_block+5, mB);
            _mm_store_si128(t_sub_block+6, mC);
            _mm_store_si128(t_sub_block+7, mD);

            _mm_store_si128(t_sub_block ,  _mm_load_si128(sub_block));
            _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
            _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
            _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));

            _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
            _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
            _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
            _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
        unsigned wave = unsigned(_mm_popcnt_u64(t - 1));
        unsigned off = wave * bm::set_block_digest_wave_size;

        const __m128i* sub_block = (const __m128i*) (xor_block + off);
        __m128i* t_sub_block = (__m128i*)(target_block + off);

        __m128i mA, mB, mC, mD;
        mA = _mm_xor_si128(_mm_load_si128(sub_block),
                           _mm_load_si128(t_sub_block));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
                           _mm_load_si128(t_sub_block+1));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
                           _mm_load_si128(t_sub_block+2));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
                           _mm_load_si128(t_sub_block+3));

        _mm_store_si128(t_sub_block, mA);
        _mm_store_si128(t_sub_block+1, mB);
        _mm_store_si128(t_sub_block+2, mC);
        _mm_store_si128(t_sub_block+3, mD);

        mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
                           _mm_load_si128(t_sub_block+4));
        mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
                           _mm_load_si128(t_sub_block+5));
        mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
                           _mm_load_si128(t_sub_block+6));
        mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
                           _mm_load_si128(t_sub_block+7));

        _mm_store_si128(t_sub_block+4, mA);
        _mm_store_si128(t_sub_block+5, mB);
        _mm_store_si128(t_sub_block+6, mC);
        _mm_store_si128(t_sub_block+7, mD);
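// Dispatch macros: the portable algorithmic core of the library calls
// these VECT_* entry points, which bind here to the SSE4.2 kernels above
// (and to the plain SSE2 kernels where SSE4.2 adds no advantage).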
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)first);

#define VECT_AND_BLOCK(dst, src) \
    sse4_and_block((__m128i*) dst, (__m128i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    sse4_and_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

#define VECT_IS_ZERO_BLOCK(dst) \
    sse4_is_all_zero((__m128i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    sse4_is_all_one((__m128i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    sse4_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse4_block_set_digest((__m128i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    sse42_shift_l1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    sse42_shift_r1((__m128i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    sse42_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    sse42_set_block_bits(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    sse42_bit_block_calc_change((__m128i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)

#define VECT_BIT_FIND_FIRST(src, pos) \
    sse42_bit_find_first((__m128i*) src, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse42_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    sse42_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse42_gap_bfind(buf, pos, is_set)
#pragma GCC diagnostic pop

#pragma warning( pop )

#endif // BMSSE4__H__INCLUDED__