1#ifndef BMSSE2__H__INCLUDED__
2#define BMSSE2__H__INCLUDED__
25#if !defined(__arm64__) && !defined(__arm__)
38#pragma GCC diagnostic push
39#pragma GCC diagnostic ignored "-Wconversion"
68 const unsigned mu1 = 0x55555555;
69 const unsigned mu2 = 0x33333333;
70 const unsigned mu3 = 0x0F0F0F0F;
71 const unsigned mu4 = 0x0000003F;
74 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
75 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
76 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
77 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
79 mcnt = _mm_xor_si128(m1, m1);
84 __m128i b = _mm_load_si128(block);
88 tmp1 = _mm_srli_epi32(b, 1);
89 tmp1 = _mm_and_si128(tmp1, m1);
90 tmp2 = _mm_and_si128(b, m1);
91 b = _mm_add_epi32(tmp1, tmp2);
94 tmp1 = _mm_srli_epi32(b, 2);
95 tmp1 = _mm_and_si128(tmp1, m2);
96 tmp2 = _mm_and_si128(b, m2);
97 b = _mm_add_epi32(tmp1, tmp2);
100 tmp1 = _mm_srli_epi32(b, 4);
101 b = _mm_add_epi32(b, tmp1);
102 b = _mm_and_si128(b, m3);
105 tmp1 = _mm_srli_epi32 (b, 8);
106 b = _mm_add_epi32(b, tmp1);
109 tmp1 = _mm_srli_epi32 (b, 16);
110 b = _mm_add_epi32(b, tmp1);
111 b = _mm_and_si128(b, m4);
113 mcnt = _mm_add_epi32(mcnt, b);
115 }
while (block < block_end);
119 _mm_store_si128((__m128i*)tcnt, mcnt);
121 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
132 const unsigned mu1 = 0x55555555;
133 const unsigned mu2 = 0x33333333;
134 const unsigned mu3 = 0x0F0F0F0F;
135 const unsigned mu4 = 0x0000003F;
138 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
139 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
140 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
141 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
143 mcnt = _mm_xor_si128(m1, m1);
147 __m128i b = _mm_load_si128(block++);
149 tmp1 = _mm_load_si128(mask_block++);
151 b = sse2_func(b, tmp1);
154 tmp1 = _mm_srli_epi32(b, 1);
155 tmp1 = _mm_and_si128(tmp1, m1);
156 tmp2 = _mm_and_si128(b, m1);
157 b = _mm_add_epi32(tmp1, tmp2);
160 tmp1 = _mm_srli_epi32(b, 2);
161 tmp1 = _mm_and_si128(tmp1, m2);
162 tmp2 = _mm_and_si128(b, m2);
163 b = _mm_add_epi32(tmp1, tmp2);
166 tmp1 = _mm_srli_epi32(b, 4);
167 b = _mm_add_epi32(b, tmp1);
168 b = _mm_and_si128(b, m3);
171 tmp1 = _mm_srli_epi32 (b, 8);
172 b = _mm_add_epi32(b, tmp1);
175 tmp1 = _mm_srli_epi32 (b, 16);
176 b = _mm_add_epi32(b, tmp1);
177 b = _mm_and_si128(b, m4);
179 mcnt = _mm_add_epi32(mcnt, b);
181 }
while (block < block_end);
184 _mm_store_si128((__m128i*)tcnt, mcnt);
186 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
197 const __m128i maskz = _mm_setzero_si128();
203 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
204 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
205 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
206 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
207 if (m1 != 0xFFFF || m2 != 0xFFFF)
210 }
while (block < block_end);
222 const __m128i mask1 = _mm_set_epi32 (~0u, ~0u, ~0u, ~0u);
228 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
229 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
230 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
231 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
232 if (m1 != 0xFFFF || m2 != 0xFFFF)
235 }
while (block < block_end);
246 const __m128i maskz = _mm_setzero_si128();
248 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
249 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
250 wA = _mm_or_si128(wA, wB);
251 auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
253 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
254 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
255 wA = _mm_or_si128(wA, wB);
256 auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
258 if (m1 != 0xFFFF || m2 != 0xFFFF)
270 __m128i mV = _mm_set1_epi32(
int(value));
271 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
272 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
273 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
274 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
298 const __m128i* sub_block = (__m128i*) (block + off);
299 __m128i* t_sub_block = (__m128i*)(target_block + off);
303 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
304 __m128i mA, mB, mC, mD;
305 mA = _mm_xor_si128(_mm_load_si128(sub_block),
306 _mm_load_si128(xor_sub_block));
307 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
308 _mm_load_si128(xor_sub_block+1));
309 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
310 _mm_load_si128(xor_sub_block+2));
311 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
312 _mm_load_si128(xor_sub_block+3));
314 _mm_store_si128(t_sub_block, mA);
315 _mm_store_si128(t_sub_block+1, mB);
316 _mm_store_si128(t_sub_block+2, mC);
317 _mm_store_si128(t_sub_block+3, mD);
319 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
320 _mm_load_si128(xor_sub_block+4));
321 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
322 _mm_load_si128(xor_sub_block+5));
323 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
324 _mm_load_si128(xor_sub_block+6));
325 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
326 _mm_load_si128(xor_sub_block+7));
328 _mm_store_si128(t_sub_block+4, mA);
329 _mm_store_si128(t_sub_block+5, mB);
330 _mm_store_si128(t_sub_block+6, mC);
331 _mm_store_si128(t_sub_block+7, mD);
336 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
337 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
338 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
339 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
341 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
342 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
343 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
344 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
370 const __m128i* sub_block = (
const __m128i*) (xor_block + off);
371 __m128i* t_sub_block = (__m128i*)(target_block + off);
373 __m128i mA, mB, mC, mD;
374 mA = _mm_xor_si128(_mm_load_si128(sub_block),
375 _mm_load_si128(t_sub_block));
376 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
377 _mm_load_si128(t_sub_block+1));
378 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
379 _mm_load_si128(t_sub_block+2));
380 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
381 _mm_load_si128(t_sub_block+3));
383 _mm_store_si128(t_sub_block, mA);
384 _mm_store_si128(t_sub_block+1, mB);
385 _mm_store_si128(t_sub_block+2, mC);
386 _mm_store_si128(t_sub_block+3, mD);
388 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
389 _mm_load_si128(t_sub_block+4));
390 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
391 _mm_load_si128(t_sub_block+5));
392 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
393 _mm_load_si128(t_sub_block+6));
394 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
395 _mm_load_si128(t_sub_block+7));
397 _mm_store_si128(t_sub_block+4, mA);
398 _mm_store_si128(t_sub_block+5, mB);
399 _mm_store_si128(t_sub_block+6, mC);
400 _mm_store_si128(t_sub_block+7, mD);
418 __m128i m1A, m1B, m1C, m1D;
419 const __m128i maskz = _mm_setzero_si128();
421 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
422 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
423 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
424 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
426 _mm_store_si128(dst+0, m1A);
427 _mm_store_si128(dst+1, m1B);
428 _mm_store_si128(dst+2, m1C);
429 _mm_store_si128(dst+3, m1D);
431 m1A = _mm_or_si128(m1A, m1B);
432 m1C = _mm_or_si128(m1C, m1D);
433 m1A = _mm_or_si128(m1A, m1C);
435 bool z1 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
437 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
438 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
439 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
440 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
442 _mm_store_si128(dst+4, m1A);
443 _mm_store_si128(dst+5, m1B);
444 _mm_store_si128(dst+6, m1C);
445 _mm_store_si128(dst+7, m1D);
447 m1A = _mm_or_si128(m1A, m1B);
448 m1C = _mm_or_si128(m1C, m1D);
449 m1A = _mm_or_si128(m1A, m1C);
451 bool z2 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
468 __m128i m1A, m1B, m1C, m1D;
470 const __m128i maskz = _mm_setzero_si128();
472 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
473 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
474 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
475 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
477 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
478 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
480 m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
481 m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
482 m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
483 m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
485 _mm_store_si128(dst+0, m1A);
486 _mm_store_si128(dst+1, m1B);
487 _mm_store_si128(dst+2, m1C);
488 _mm_store_si128(dst+3, m1D);
491 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
492 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
493 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
494 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
496 mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
497 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
499 m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
500 m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
501 m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
502 m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
504 _mm_store_si128(dst+4, m1A);
505 _mm_store_si128(dst+5, m1B);
506 _mm_store_si128(dst+6, m1C);
507 _mm_store_si128(dst+7, m1D);
525 __m128i m1A, m1B, m1C, m1D;
526 __m128i m1E, m1F, m1G, m1H;
528 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
529 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
530 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
531 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
533 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
534 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
535 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
536 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
538 m1A = _mm_and_si128(m1A, m1E);
539 m1B = _mm_and_si128(m1B, m1F);
540 m1C = _mm_and_si128(m1C, m1G);
541 m1D = _mm_and_si128(m1D, m1H);
543 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
544 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
545 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
546 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
548 _mm_store_si128(dst+0, m1A);
549 _mm_store_si128(dst+1, m1B);
550 _mm_store_si128(dst+2, m1C);
551 _mm_store_si128(dst+3, m1D);
553 m1A = _mm_or_si128(m1A, m1B);
554 m1C = _mm_or_si128(m1C, m1D);
555 m1A = _mm_or_si128(m1A, m1C);
557 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
559 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
560 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
561 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
562 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
564 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
565 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
566 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
567 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
569 m1A = _mm_and_si128(m1A, m1E);
570 m1B = _mm_and_si128(m1B, m1F);
571 m1C = _mm_and_si128(m1C, m1G);
572 m1D = _mm_and_si128(m1D, m1H);
574 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
575 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
576 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
577 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
579 _mm_store_si128(dst+4, m1A);
580 _mm_store_si128(dst+5, m1B);
581 _mm_store_si128(dst+6, m1C);
582 _mm_store_si128(dst+7, m1D);
584 m1A = _mm_or_si128(m1A, m1B);
585 m1C = _mm_or_si128(m1C, m1D);
586 m1A = _mm_or_si128(m1A, m1C);
588 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
606 __m128i m1A, m1B, m1C, m1D;
608 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
609 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
610 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
611 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
613 _mm_store_si128(dst+0, m1A);
614 _mm_store_si128(dst+1, m1B);
615 _mm_store_si128(dst+2, m1C);
616 _mm_store_si128(dst+3, m1D);
618 m1A = _mm_or_si128(m1A, m1B);
619 m1C = _mm_or_si128(m1C, m1D);
620 m1A = _mm_or_si128(m1A, m1C);
622 const __m128i maskz = _mm_setzero_si128();
623 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
625 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
626 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
627 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
628 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
630 _mm_store_si128(dst+4, m1A);
631 _mm_store_si128(dst+5, m1B);
632 _mm_store_si128(dst+6, m1C);
633 _mm_store_si128(dst+7, m1D);
635 m1A = _mm_or_si128(m1A, m1B);
636 m1C = _mm_or_si128(m1C, m1D);
637 m1A = _mm_or_si128(m1A, m1C);
639 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
655 __m128i m1A, m1B, m1C, m1D;
656 const __m128i maskz = _mm_setzero_si128();
658 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
659 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
660 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
661 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
663 _mm_store_si128(dst+0, m1A);
664 _mm_store_si128(dst+1, m1B);
665 _mm_store_si128(dst+2, m1C);
666 _mm_store_si128(dst+3, m1D);
668 m1A = _mm_or_si128(m1A, m1B);
669 m1C = _mm_or_si128(m1C, m1D);
670 m1A = _mm_or_si128(m1A, m1C);
672 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
674 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
675 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
676 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
677 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
679 _mm_store_si128(dst+4, m1A);
680 _mm_store_si128(dst+5, m1B);
681 _mm_store_si128(dst+6, m1C);
682 _mm_store_si128(dst+7, m1D);
684 m1A = _mm_or_si128(m1A, m1B);
685 m1C = _mm_or_si128(m1C, m1D);
686 m1A = _mm_or_si128(m1A, m1C);
688 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
705 __m128i m1A, m1B, m1C, m1D;
706 const __m128i maskz = _mm_setzero_si128();
708 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
709 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
710 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
711 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
713 _mm_store_si128(dst+0, m1A);
714 _mm_store_si128(dst+1, m1B);
715 _mm_store_si128(dst+2, m1C);
716 _mm_store_si128(dst+3, m1D);
718 m1A = _mm_or_si128(m1A, m1B);
719 m1C = _mm_or_si128(m1C, m1D);
720 m1A = _mm_or_si128(m1A, m1C);
722 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
724 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
725 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
726 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
727 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
729 _mm_store_si128(dst+4, m1A);
730 _mm_store_si128(dst+5, m1B);
731 _mm_store_si128(dst+6, m1C);
732 _mm_store_si128(dst+7, m1D);
734 m1A = _mm_or_si128(m1A, m1B);
735 m1C = _mm_or_si128(m1C, m1D);
736 m1A = _mm_or_si128(m1A, m1C);
738 bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
754 const __m128i* block_end =
756 const __m128i maskZ = _mm_setzero_si128();
758 unsigned simd_lane = 0;
762 mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
763 __m128i mOR = _mm_or_si128(mA, mB);
764 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
767 z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
770 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
774 _mm_store_si128 ((__m128i*)simd_buf, mA);
775 unsigned widx = bsf >> 2;
776 unsigned w = simd_buf[widx];
778 *pos = (simd_lane * 128) + (widx * 32) + bsf;
781 unsigned mask = (_mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)));
785 _mm_store_si128 ((__m128i*)simd_buf, mB);
786 unsigned widx = bsf >> 2;
787 unsigned w = simd_buf[widx];
789 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
794 }
while (block < block_end);
810 const __m128i* block1_end =
812 const __m128i maskZ = _mm_setzero_si128();
814 unsigned simd_lane = 0;
817 mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
818 mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
819 __m128i mOR = _mm_or_si128(mA, mB);
820 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
823 z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
826 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
830 _mm_store_si128 ((__m128i*)simd_buf, mA);
831 unsigned widx = bsf >> 2;
832 unsigned w = simd_buf[widx];
834 *pos = (simd_lane * 128) + (widx * 32) + bsf;
837 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
841 _mm_store_si128 ((__m128i*)simd_buf, mB);
842 unsigned widx = bsf >> 2;
843 unsigned w = simd_buf[widx];
845 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
849 block1+=2; block2+=2;
850 }
while (block1 < block1_end);
872 __m128i m1COshft, m2COshft;
873 __m128i mAcc = _mm_set1_epi32(0);
875 __m128i mMask0 = _mm_set_epi32(-1,-1,-1, 0);
878 for (;block < block_end; block += 2)
880 __m128i m1A = _mm_load_si128(block);
881 __m128i m2A = _mm_load_si128(block+1);
883 __m128i m1CO = _mm_srli_epi32(m1A, 31);
884 __m128i m2CO = _mm_srli_epi32(m2A, 31);
886 co2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m1CO, 0xFF));
888 m1A = _mm_slli_epi32(m1A, 1);
889 m2A = _mm_slli_epi32(m2A, 1);
891 m1COshft = _mm_slli_si128 (m1CO, 4);
892 m2COshft = _mm_slli_si128 (m2CO, 4);
894 m1COshft = _mm_and_si128(m1COshft, mMask0);
895 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(0, 0, 0, co1));
897 m2COshft = _mm_and_si128(m2COshft, mMask0);
898 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(0, 0, 0, co2));
900 m1A = _mm_or_si128(m1A, m1COshft);
901 m2A = _mm_or_si128(m2A, m2COshft);
903 co1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m2CO, 0xFF));
905 _mm_store_si128(block, m1A);
906 _mm_store_si128(block+1, m2A);
908 mAcc = _mm_or_si128(mAcc, m1A);
909 mAcc = _mm_or_si128(mAcc, m2A);
911 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
925 __m128i mAcc = _mm_set1_epi32(0);
926 __m128i mMask1 = _mm_set1_epi32(1);
927 __m128i mMask0 = _mm_set_epi32(0, -1, -1, -1);
930 for (--block_end; block_end >= block; block_end -= 2)
932 __m128i m1A = _mm_load_si128(block_end);
933 __m128i m2A = _mm_load_si128(block_end-1);
935 __m128i m1CO = _mm_and_si128(m1A, mMask1);
936 __m128i m2CO = _mm_and_si128(m2A, mMask1);
938 co2 = _mm_cvtsi128_si32 (m1CO);
940 m1A = _mm_srli_epi32(m1A, 1);
941 m2A = _mm_srli_epi32(m2A, 1);
943 __m128i m1COshft = _mm_srli_si128 (m1CO, 4);
944 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
948 m1COshft = _mm_and_si128(m1COshft, mMask0);
949 m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(co1, 0, 0, 0));
950 m2COshft = _mm_and_si128(m2COshft, mMask0);
951 m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(co2, 0, 0, 0));
954 m1COshft = _mm_slli_epi32(m1COshft, 31);
955 m2COshft = _mm_slli_epi32(m2COshft, 31);
957 m1A = _mm_or_si128(m1A, m1COshft);
958 m2A = _mm_or_si128(m2A, m2COshft);
960 co1 = _mm_cvtsi128_si32 (m2CO);
962 _mm_store_si128(block_end, m1A);
963 _mm_store_si128(block_end-1, m2A);
965 mAcc = _mm_or_si128(mAcc, m1A);
966 mAcc = _mm_or_si128(mAcc, m2A);
969 bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
981 const unsigned mu1 = 0x55555555;
982 const unsigned mu2 = 0x33333333;
983 const unsigned mu3 = 0x0F0F0F0F;
984 const unsigned mu4 = 0x0000003F;
987 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
988 __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
989 __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
990 __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
992 mcnt = _mm_xor_si128(m1, m1);
997 int count = (int)(block_end - block)*4;
1000 const int w_shift =
sizeof(w) * 8 - 1;
1001 bool first_word =
true;
1009 count -= (w_prev = (w0 >> w_shift));
1019 __m128i b = _mm_load_si128(block);
1022 tmp1 = _mm_srli_epi32(b, 1);
1023 tmp2 = _mm_xor_si128(b, tmp1);
1024 _mm_store_si128((__m128i*)tcnt, tmp2);
1032 tmp1 = _mm_and_si128(tmp1, m1);
1033 tmp2 = _mm_and_si128(b, m1);
1034 b = _mm_add_epi32(tmp1, tmp2);
1037 tmp1 = _mm_srli_epi32(b, 2);
1038 tmp1 = _mm_and_si128(tmp1, m2);
1039 tmp2 = _mm_and_si128(b, m2);
1040 b = _mm_add_epi32(tmp1, tmp2);
1043 tmp1 = _mm_srli_epi32(b, 4);
1044 b = _mm_add_epi32(b, tmp1);
1045 b = _mm_and_si128(b, m3);
1048 tmp1 = _mm_srli_epi32 (b, 8);
1049 b = _mm_add_epi32(b, tmp1);
1052 tmp1 = _mm_srli_epi32 (b, 16);
1053 b = _mm_add_epi32(b, tmp1);
1054 b = _mm_and_si128(b, m4);
1056 mcnt = _mm_add_epi32(mcnt, b);
1077 count -= !(w_prev ^ (w0 & 1));
1078 count -= w_prev = (w0 >> w_shift);
1082 count -= !w_prev; w_prev ^= w_prev;
1088 count -= !(w_prev ^ (w0 & 1));
1089 count -= w_prev = (w0 >> w_shift);
1093 count -= !w_prev; w_prev ^= w_prev;
1098 count -= !(w_prev ^ (w0 & 1));
1099 count -= w_prev = (w0 >> w_shift);
1103 count -= !w_prev; w_prev ^= w_prev;
1108 count -= !(w_prev ^ (w0 & 1));
1109 count -= w_prev = (w0 >> w_shift);
1113 count -= !w_prev; w_prev ^= w_prev;
1116 }
while (++block < block_end);
1118 _mm_store_si128((__m128i*)tcnt, mcnt);
1119 *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
1121 return unsigned(count);
1126#pragma GCC diagnostic push
1127#pragma GCC diagnostic ignored "-Warray-bounds"
1140 const unsigned unroll_factor = 8;
1144 for (j = 0; j < size; ++j)
1152 __m128i m1, mz, maskF, maskFL;
1154 mz = _mm_setzero_si128();
1155 m1 = _mm_loadu_si128((__m128i*)(pbuf));
1157 maskF = _mm_cmpeq_epi32(mz, mz);
1158 maskFL = _mm_slli_si128(maskF, 4 * 2);
1159 int shiftL = (64 - (unroll_factor - size) * 16);
1160 maskFL = _mm_slli_epi64(maskFL, shiftL);
1162 m1 = _mm_andnot_si128(maskFL, m1);
1163 m1 = _mm_or_si128(m1, maskFL);
1165 __m128i mp = _mm_set1_epi16(pos);
1166 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
1167 int mi = _mm_movemask_epi8(mge_mask);
1181 m1 = _mm_loadu_si128((__m128i*)(pbuf2));
1182 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
1183 mi = _mm_movemask_epi8(mge_mask);
1187 return size - (unroll_factor - bsr_i);
1207 unsigned end = 1 + ((*buf) >> 3);
1208 unsigned dsize = end - start;
1213 *is_set = ((*buf) & 1) ^ (start & 1);
1215 BM_ASSERT(buf[start] < pos || (start==0));
1219 unsigned arr_end = end;
1220 while (start != end)
1222 unsigned curr = (start + end) >> 1;
1223 if (buf[curr] < pos)
1228 unsigned size = end - start;
1231 size += (end != arr_end);
1237 BM_ASSERT(buf[start - 1] < pos || (start == 1));
1242 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1262#pragma GCC diagnostic pop
/* ------------------------------------------------------------------
 * Algorithm dispatch macros: bind the portable VECT_* entry points
 * used by the generic bit-block layer to the SSE2 implementations
 * defined above in this header.
 * NOTE(review): the casts to __m128i* assume the bit-block pointers
 * are 16-byte aligned -- confirm against the block allocator.
 * ------------------------------------------------------------------ */

/* masked XOR / AND-NOT over a word array [src, src_end) into dst */
1266#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1267 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1269#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1270 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

/* population counts: plain, and fused with a logical op against mask */
1272#define VECT_BITCOUNT(first, last) \
1273 sse2_bit_count((__m128i*) (first), (__m128i*) (last))
1275#define VECT_BITCOUNT_AND(first, last, mask) \
1276 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1278#define VECT_BITCOUNT_OR(first, last, mask) \
1279 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1281#define VECT_BITCOUNT_XOR(first, last, mask) \
1282 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1284#define VECT_BITCOUNT_SUB(first, last, mask) \
1285 sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

/* NOTE(review): unlike its siblings this expansion ends with ';' --
 * call sites must not add their own semicolon */
1287#define VECT_INVERT_BLOCK(first) \
1288 sse2_invert_block((__m128i*)first);

/* whole-block logical operations (dst op= src), 2/3/5-way variants */
1290#define VECT_AND_BLOCK(dst, src) \
1291 sse2_and_block((__m128i*) dst, (__m128i*) (src))
1293#define VECT_AND_DIGEST(dst, src) \
1294 sse2_and_digest((__m128i*) dst, (const __m128i*) (src))
1296#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
1297 sse2_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1299#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1300 sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1302#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1303 sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1305#define VECT_OR_BLOCK(dst, src) \
1306 sse2_or_block((__m128i*) dst, (__m128i*) (src))
1308#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1309 sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1311#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1312 sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1314#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1315 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1317#define VECT_SUB_BLOCK(dst, src) \
1318 sse2_sub_block((__m128i*) dst, (__m128i*) (src))
1320#define VECT_SUB_DIGEST(dst, src) \
1321 sse2_sub_digest((__m128i*) dst, (const __m128i*) (src))
1323#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1324 sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1326#define VECT_XOR_BLOCK(dst, src) \
1327 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1329#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1330 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

/* block copy / streaming-store / fill primitives */
1332#define VECT_COPY_BLOCK(dst, src) \
1333 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1335#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
1336 sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
1338#define VECT_STREAM_BLOCK(dst, src) \
1339 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1341#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
1342 sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
1344#define VECT_SET_BLOCK(dst, value) \
1345 sse2_set_block((__m128i*) dst, value)

/* block / digest predicates */
1347#define VECT_IS_ZERO_BLOCK(dst) \
1348 sse2_is_all_zero((__m128i*) dst)
1350#define VECT_IS_ONE_BLOCK(dst) \
1351 sse2_is_all_one((__m128i*) dst)
1353#define VECT_IS_DIGEST_ZERO(start) \
1354 sse2_is_digest_zero((__m128i*)start)
1356#define VECT_BLOCK_SET_DIGEST(dst, val) \
1357 sse2_block_set_digest((__m128i*)dst, val)

/* search, shift and XOR-filter helpers */
1359#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1360 sse2_lower_bound_scan_u32(arr, target, from, to)
1362#define VECT_SHIFT_R1(b, acc, co) \
1363 sse2_shift_r1((__m128i*)b, acc, co)
1366#define VECT_BIT_FIND_FIRST(src, pos) \
1367 sse2_bit_find_first((__m128i*) src, pos)
1369#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1370 sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1372#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1373 sse2_bit_block_xor(t, src, src_xor, d)
1375#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
1376 sse2_bit_block_xor_2way(t, src_xor, d)

/* GAP (run-length) block binary search */
1378#define VECT_GAP_BFIND(buf, pos, is_set) \
1379 sse2_gap_bfind(buf, pos, is_set)
1385#pragma GCC diagnostic pop
Compute functions for SSE SIMD instruction set (internal)
Bit manipulation primitives (internal)
bm::id_t sse2_bit_count(const __m128i *block, const __m128i *block_end)
BMFORCEINLINE void sse2_block_set_digest(__m128i *dst, unsigned value) BMNOEXCEPT
set digest stride to 0xFF.. or 0x0 value
BMFORCEINLINE bool sse2_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND block digest stride dst &= *src.
void sse2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse2_bit_find_first_diff(const __m128i *BMRESTRICT block1, const __m128i *BMRESTRICT block2, unsigned *pos) BMNOEXCEPT
Find first bit which is different between two bit-blocks.
bool sse2_bit_find_first(const __m128i *BMRESTRICT block, unsigned *pos) BMNOEXCEPT
Find first non-zero bit.
bool sse2_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift right by 1
bool sse2_is_all_one(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all ONE bits
bool sse2_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4) BMNOEXCEPT
AND block digest stride.
unsigned sse2_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to linear scan.
unsigned sse2_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to linear scan.
BMFORCEINLINE bool sse2_and_or_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND-OR block digest stride dst |= *src1 & *src2.
BMFORCEINLINE bool sse2_is_digest_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if digest stride is all zero bits
void sse2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest) BMNOEXCEPT
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse2_is_all_zero(const __m128i *BMRESTRICT block) BMNOEXCEPT
check if block is all zero bits
BMFORCEINLINE bool sse2_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
AND block digest stride dst = *src1 & *src2.
BMFORCEINLINE bool sse2_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SUB (AND NOT) block digest stride dst &= ~*src.
bool sse2_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1) BMNOEXCEPT
block shift left by 1
BMFORCEINLINE bool sse2_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
BMFORCEINLINE unsigned word_bitcount64(bm::id64_t x) BMNOEXCEPT
bm::id_t sse2_bit_block_calc_count_change(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, unsigned *BMRESTRICT bit_count)
const unsigned set_block_digest_wave_size
unsigned sse2_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
BMFORCEINLINE unsigned bit_scan_forward32(unsigned w) BMNOEXCEPT
BMFORCEINLINE T bit_scan_fwd(T v) BMNOEXCEPT
bm::id_t sse2_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func)
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
BMFORCEINLINE unsigned long long bmi_bslr_u64(unsigned long long w) BMNOEXCEPT
unsigned short gap_word_t
BMFORCEINLINE unsigned long long bmi_blsi_u64(unsigned long long w)