#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__

// Assumed include set: the original text between the include guard and the
// pragmas was elided; the intrinsics below require MMX/SSE/SSE2 headers.
// BMRESTRICT, BMNOEXCEPT, BMFORCEINLINE, bm::word_t, bm::gap_word_t and
// bm::set_block_size come from the core BitMagic headers.
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

namespace bm
{
// XOR array elements to specified mask: *dst = *src ^ mask
inline
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask) BMNOEXCEPT
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
// Invert array elements and AND with specified mask: *dst = ~*src & mask
inline
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask) BMNOEXCEPT
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
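/* Usage sketch (assumed caller-side code, not part of the original header):
   apply a constant mask across a whole block with the two primitives above.
   XOR against all ones is a bitwise NOT; the block/pointer names are
   hypothetical.

    alignas(16) bm::word_t src[bm::set_block_size];
    alignas(16) bm::word_t dst[bm::set_block_size];
    const __m128i* s   = (const __m128i*)src;
    const __m128i* s_e = (const __m128i*)(src + bm::set_block_size);

    bm::sse2_xor_arr_2_mask((__m128i*)dst, s, s_e, ~0u);          // dst = ~src
    bm::sse2_andnot_arr_2_mask((__m128i*)dst, s, s_e, 0x0000FFFFu); // dst = ~src & mask
*/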
// AND two blocks: *dst &= *src
// Returns an OR-fold of all result words (0 means dst is now all zero)
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;  // restored: advance to the second unrolled wave

        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
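/* Usage sketch (assumption, caller names are hypothetical): the unsigned
   return value is an OR-fold of every word written, so it is zero exactly
   when the AND left the destination block empty.

    unsigned any = bm::sse2_and_block((__m128i*)dst_blk, (const __m128i*)src_blk);
    if (!any)
        release_block(dst_blk); // hypothetical caller-side cleanup
*/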
// AND array elements against another array (unaligned src): *dst &= *src
// Returns an OR-fold of all result words (0 means dst is now all zero)
inline
unsigned sse2_and_arr_unal(__m128i* BMRESTRICT dst,
                           const __m128i* BMRESTRICT src,
                           const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_loadu_si128(src+0);
        m2A = _mm_load_si128(dst+0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst+0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_loadu_si128(src+1);
        m2B = _mm_load_si128(dst+1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst+1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_loadu_si128(src+2);
        m2C = _mm_load_si128(dst+2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst+2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_loadu_si128(src+3);
        m2D = _mm_load_si128(dst+3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst+3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
// AND blocks (aligned src, explicit bound): *dst &= *src
// Signature reconstructed as a bounded overload of sse2_and_block (an assumption;
// the displaced declaration list does not name this variant)
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src,
                        const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
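/* Scalar reference for the three AND kernels above (a clarifying sketch,
   not part of the original header): same result, same OR-digest contract.

    inline unsigned and_block_scalar(bm::word_t* dst, const bm::word_t* src,
                                     unsigned len)
    {
        unsigned acc = 0;
        for (unsigned i = 0; i < len; ++i)
            acc |= (dst[i] &= src[i]);  // AND in place, fold result into acc
        return acc;                     // 0 => destination is now all zero
    }
*/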
// OR array elements against another array: *dst |= *src
// Returns true if the destination block became all ones
inline
bool sse2_or_block(__m128i* BMRESTRICT dst,
                   const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
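/* Usage sketch (assumption, caller names are hypothetical): the bool result
   reports whether every word of the destination became ~0u, i.e. the block
   saturated to all ones.

    bool all_ones = bm::sse2_or_block((__m128i*)dst_blk, (const __m128i*)src_blk);
    if (all_ones)
        replace_with_full_block(dst_blk); // hypothetical caller-side optimization
*/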
// OR array elements against another array (unaligned src): *dst |= *src
// Returns true if the destination block became all ones
inline
bool sse2_or_arr_unal(__m128i* BMRESTRICT dst,
                      const __m128i* BMRESTRICT src,
                      const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);
    do
    {
        m1A = _mm_loadu_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_loadu_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_loadu_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_loadu_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
// OR two blocks and copy the result to the destination: *dst = *src1 | *src2
// Returns true if the destination block became all ones
inline
bool sse2_or_block_2way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
// OR array elements against another two arrays: *dst |= *src1 | *src2
// Returns true if the destination block became all ones
inline
bool sse2_or_block_3way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
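/* Usage sketch (assumption, pointers d/s1/s2 are hypothetical): the 2-way
   form overwrites the destination, the 3-way form also ORs the destination's
   previous content into the result.

    bm::sse2_or_block_2way(d, s1, s2);   // *d  = *s1 | *s2
    bm::sse2_or_block_3way(d, s1, s2);   // *d |= *s1 | *s2
*/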
// OR array elements against another four arrays:
// *dst |= *src1 | *src2 | *src3 | *src4
// Returns true if the destination block became all ones
inline
bool sse2_or_block_5way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2,
                        const __m128i* BMRESTRICT src3,
                        const __m128i* BMRESTRICT src4) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));

        _mm_stream_si128(dst + 0, m1A);
        _mm_stream_si128(dst + 1, m1B);
        _mm_stream_si128(dst + 2, m1C);
        _mm_stream_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;
        dst += 4;  // restored (required by the dst stores above)

        _mm_prefetch((const char*)src3, _MM_HINT_T0);
        _mm_prefetch((const char*)src4, _MM_HINT_T0);
    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}
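/* Design note and usage sketch (assumption): unlike the other OR kernels,
   the 5-way variant writes with _mm_stream_si128 (non-temporal stores) and
   prefetches src3/src4, which avoids cache pollution when fanning many
   source blocks into one destination. Non-temporal stores are weakly
   ordered, so a caller publishing the result to another thread would need
   an _mm_sfence() first.

    bool f = bm::sse2_or_block_5way(d, s1, s2, s3, s4); // *d |= s1|s2|s3|s4
    _mm_sfence(); // only if the result is handed to another thread right away
*/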
// XOR block against another: *dst ^= *src
// Returns an OR-fold of all result words (0 means the blocks were identical)
inline
unsigned sse2_xor_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;  // restored loop increment (implied by the while test)
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
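/* Usage sketch (assumption, caller names are hypothetical): a zero XOR
   digest means the two blocks were identical; the destination also ends up
   all zero in that case.

    unsigned diff = bm::sse2_xor_block((__m128i*)dst_blk, (const __m128i*)src_blk);
    bool blocks_were_equal = (diff == 0);
*/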
// Three-operand XOR: *dst = *src1 ^ *src2
// Returns an OR-fold of all result words (0 means src1 and src2 were identical)
inline
unsigned sse2_xor_block_2way(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src1,
                             const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src1_end =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
// AND-NOT (SUB) array elements against another array: *dst &= ~*src
// Returns an OR-fold of all result words (0 means dst is now all zero)
inline
unsigned sse2_sub_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;  // restored: advance to the second unrolled wave

        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    alignas(16) bm::word_t macc[4];  // scratch for the OR-digest (declaration was elided)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
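/* Usage sketch (assumption, caller names are hypothetical): logical SUB is
   A AND NOT B; a zero digest means src covered every bit dst had.

    unsigned left = bm::sse2_sub_block((__m128i*)dst_blk, (const __m128i*)src_blk);
    if (!left)
        release_block(dst_blk); // hypothetical: dst is now empty
*/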
// SSE2 block memset: *dst = value
inline
void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
{
    // assumed bound: one bit block of bm::set_block_size words
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i xmm0 = _mm_set1_epi32((int)value);
    do
    {
        _mm_store_si128(dst,   xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;  // restored loop increment (implied by the while test)
    } while (dst < dst_end);
}
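/* Usage sketch (assumption, dst_blk is hypothetical): block fill; 0 clears
   the block, ~0u sets every bit.

    bm::sse2_set_block((__m128i*)dst_blk, 0u);   // all zero
    bm::sse2_set_block((__m128i*)dst_blk, ~0u);  // all ones
*/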
// SSE2 block copy: *dst = *src
inline
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
// SSE2 block copy (unaligned src): *dst = *src
inline
void sse2_copy_block_unalign(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_loadu_si128(src+0);
        xmm1 = _mm_loadu_si128(src+1);
        xmm2 = _mm_loadu_si128(src+2);
        xmm3 = _mm_loadu_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_loadu_si128(src+4);
        xmm1 = _mm_loadu_si128(src+5);
        xmm2 = _mm_loadu_si128(src+6);
        xmm3 = _mm_loadu_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
// SSE2 block copy with non-temporal stores: *dst = *src
inline
void sse2_stream_block(__m128i* BMRESTRICT dst,
                       const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
// SSE2 block copy with non-temporal stores (unaligned src): *dst = *src
inline
void sse2_stream_block_unalign(__m128i* BMRESTRICT dst,
                               const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    // assumed bound: one bit block of bm::set_block_size words
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_loadu_si128(src+0);
        xmm1 = _mm_loadu_si128(src+1);
        xmm2 = _mm_loadu_si128(src+2);
        xmm3 = _mm_loadu_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_loadu_si128(src+4);
        xmm1 = _mm_loadu_si128(src+5);
        xmm2 = _mm_loadu_si128(src+6);
        xmm3 = _mm_loadu_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;  // restored loop increment (implied by the while test)
    } while (src < src_end);
}
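/* Design note (assumption): the four copy kernels above differ only in the
   load (aligned vs unaligned source) and the store (_mm_store_si128 vs the
   non-temporal _mm_stream_si128). The stream variants suit destinations
   that will not be re-read soon, e.g. bulk serialization:

    bm::sse2_stream_block((__m128i*)dst_blk, (const __m128i*)src_blk);
    _mm_sfence(); // order the non-temporal stores before publishing dst_blk
*/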
// Invert bit block: *dst = ~*dst
inline
void sse2_invert_block(__m128i* BMRESTRICT dst) BMNOEXCEPT
{
    // assumed bound: one bit block of bm::set_block_size words
    bm::word_t* BMRESTRICT dst_end = (bm::word_t*)dst + bm::set_block_size;

    __m128i maskF = _mm_set1_epi32(~0u);
    __m128i mA, mB, mC, mD;
    do
    {
        mA = _mm_load_si128(dst + 0);
        mB = _mm_load_si128(dst + 1);
        mA = _mm_xor_si128(mA, maskF);
        mB = _mm_xor_si128(mB, maskF);
        _mm_store_si128(dst,     mA);
        _mm_store_si128(dst + 1, mB);

        mC = _mm_load_si128(dst + 2);
        mD = _mm_load_si128(dst + 3);
        mC = _mm_xor_si128(mC, maskF);
        mD = _mm_xor_si128(mD, maskF);
        _mm_store_si128(dst + 2, mC);
        _mm_store_si128(dst + 3, mD);

        dst += 4;  // restored loop increment (implied by the while test)
    } while (dst < (__m128i*)dst_end);
}
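/* Usage sketch (assumption, blk is hypothetical): in-place logical NOT of a
   whole block. SSE2 has no vector NOT instruction, hence the XOR against an
   all-ones mask above.

    bm::sse2_invert_block((__m128i*)blk);
*/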
// Single-vector logical wrappers
BMFORCEINLINE __m128i sse2_and(__m128i a, __m128i b) BMNOEXCEPT { return _mm_and_si128(a, b); }
BMFORCEINLINE __m128i sse2_or (__m128i a, __m128i b) BMNOEXCEPT { return _mm_or_si128(a, b); }
BMFORCEINLINE __m128i sse2_xor(__m128i a, __m128i b) BMNOEXCEPT { return _mm_xor_si128(a, b); }
BMFORCEINLINE __m128i sse2_sub(__m128i a, __m128i b) BMNOEXCEPT { return _mm_andnot_si128(b, a); } // a & ~b
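/* Usage sketch (assumption): the one-liner wrappers let higher-level code be
   written once and parameterized by operation. A hypothetical combine kernel:

    template<class Op>
    void combine(__m128i* dst, const __m128i* src, const __m128i* src_end, Op op)
    {
        for (; src < src_end; ++src, ++dst)
            _mm_store_si128(dst, op(_mm_load_si128(dst), _mm_load_si128(src)));
    }
    // combine(d, s, s_end, [](__m128i a, __m128i b){ return bm::sse2_and(a, b); });
*/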
// Gap block population count (array sum) utility
inline
const bm::gap_word_t* sse2_gap_sum_arr(const bm::gap_word_t* BMRESTRICT pbuf,
                                       unsigned sse_vect_waves,
                                       unsigned* sum) BMNOEXCEPT
{
    __m128i xcnt = _mm_setzero_si128();

    for (unsigned i = 0; i < sse_vect_waves; ++i)
    {
        __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
        __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
        __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
        xcnt = _mm_add_epi16(xcnt, mm_s2);
        pbuf += 16;  // restored: one wave consumes 16 gap words (an assumption)
    }
    // fold each 32-bit lane to (hi16 - lo16): pairwise gap-word differences
    xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);

    unsigned short* cnt8 = (unsigned short*)&xcnt;
    *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
    return pbuf;  // restored return value (matches the declared signature)
}
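/* Scalar reference (a clarifying sketch, not part of the original header):
   each 16-bit lane accumulates gap words across waves, and the final fold
   turns lane pairs into (odd position - even position) differences, i.e.
   summed run lengths. Note that, like the SIMD code, the first iteration
   reads one gap word before pbuf.

    unsigned gap_sum_scalar(const bm::gap_word_t* pbuf, unsigned waves)
    {
        unsigned s = 0;
        for (unsigned i = 0; i < waves * 16u; i += 2)
            s += (bm::gap_word_t)(pbuf[i] - pbuf[i - 1]); // 16-bit wraparound
        return s;
    }
*/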
// Lower bound (greater-or-equal) linear scan in an ascending-sorted array
inline
unsigned sse2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
                                   unsigned target,
                                   unsigned from,
                                   unsigned to) BMNOEXCEPT
{
    const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base

    unsigned unroll_factor = 8;
    unsigned len = to - from + 1;
    unsigned len_unr = len - (len % unroll_factor);

    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i vect_target = _mm_set1_epi32(target);
    __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // signed-compare bias

    int mask;
    __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;

    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k]));
        norm_vect40 = _mm_sub_epi32(vect40, mask0x8);

        cmp_mask_ge = _mm_or_si128(                       // unsigned >= via (GT | EQ)
            _mm_cmpgt_epi32(norm_vect40, norm_target),
            _mm_cmpeq_epi32(vect40, vect_target));
        mask = _mm_movemask_epi8(cmp_mask_ge);
        if (mask)  // restored branch (implied by the early returns)
        {
            unsigned bsf = bm::bit_scan_forward32((unsigned)mask);
            return from + k + (bsf / 4);
        }
        vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k + 4]));
        norm_vect41 = _mm_sub_epi32(vect41, mask0x8);

        cmp_mask_ge = _mm_or_si128(
            _mm_cmpgt_epi32(norm_vect41, norm_target),
            _mm_cmpeq_epi32(vect41, vect_target));
        mask = _mm_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            unsigned bsf = bm::bit_scan_forward32((unsigned)mask);
            return 4 + from + k + (bsf / 4);
        }
    } // for k

    for (; k < len; ++k)  // scalar tail
    {
        if (arr_base[k] >= target)
            return from + k;
    }
    return to + 1;  // not found within [from..to] (restored tail, an assumption)
}
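/* Usage sketch (assumption): find the first index in [from..to] whose value
   is greater than or equal to the target.

    static const unsigned arr[] = { 1, 5, 9, 32, 32, 70, 120, 255, 256, 300 };
    unsigned idx = bm::sse2_lower_bound_scan_u32(arr, 32u, 0u, 9u); // -> 3
*/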
#pragma GCC diagnostic pop
// SSE2 reinitialization guard class: brackets vector code with _mm_empty()
// for x87/MMX state interoperability (bodies reconstructed, an assumption)
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard() BMNOEXCEPT { _mm_empty(); }
    BMFORCEINLINE ~sse_empty_guard() BMNOEXCEPT { _mm_empty(); }
};
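/* Usage sketch (assumption): scope the guard around a section that mixes
   MMX-era intrinsics with x87 floating point.

    {
        bm::sse_empty_guard guard;  // _mm_empty() on entry
        // ... vector-heavy section ...
    }                               // _mm_empty() again on scope exit
*/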
/*
    Function reference (SSE2 utility group). Relies on bm::word_t,
    bm::gap_word_t, bm::set_block_size and bit_scan_forward32() from the
    core BitMagic headers.

    sse2_xor_arr_2_mask()       - XOR array elements with a mask: *dst = *src ^ mask
    sse2_andnot_arr_2_mask()    - invert array elements and AND with a mask: *dst = ~*src & mask
    sse2_and_block()            - AND blocks: *dst &= *src
    sse2_and_arr_unal()         - AND array elements (unaligned src): *dst &= *src
    sse2_or_block()             - OR array elements: *dst |= *src
    sse2_or_arr_unal()          - OR array elements (unaligned src): *dst |= *src
    sse2_or_block_2way()        - OR two blocks into the destination: *dst = *src1 | *src2
    sse2_or_block_3way()        - OR against another two arrays: *dst |= *src1 | *src2
    sse2_or_block_5way()        - OR against another four arrays: *dst |= *src1 | *src2 | *src3 | *src4
    sse2_xor_block()            - XOR block against another: *dst ^= *src
    sse2_xor_block_2way()       - three-operand XOR: *dst = *src1 ^ *src2
    sse2_sub_block()            - AND-NOT (SUB) against another array: *dst &= ~*src
    sse2_set_block()            - block memset: *dst = value
    sse2_copy_block()           - block copy: *dst = *src
    sse2_copy_block_unalign()   - block copy (unaligned src)
    sse2_stream_block()         - block copy with non-temporal stores
    sse2_stream_block_unalign() - non-temporal block copy (unaligned src)
    sse2_invert_block()         - invert bit block: *dst = ~*dst
    sse2_and() / sse2_or() /
    sse2_xor() / sse2_sub()     - single-vector logical wrappers
    sse2_gap_sum_arr()          - GAP block population count (array sum) utility
    sse2_lower_bound_scan_u32() - lower-bound (>=) linear scan in a sorted array
    sse_empty_guard             - _mm_empty() scope guard
*/

} // namespace bm

#endif // BMSSE_UTIL__H__INCLUDED__