BitMagic-C++
bmsse_util.h
Go to the documentation of this file.
1#ifndef BMSSE_UTIL__H__INCLUDED__
2#define BMSSE_UTIL__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmsse_util.h
22 \brief Compute functions for SSE SIMD instruction set (internal)
23*/
24
25namespace bm
26{
27
28/** @defgroup SSE2 SSE2 functions
29 Processor specific optimizations for SSE2 instructions (internals)
30 @internal
31 @ingroup bvector
32 */
33
34#ifdef __GNUG__
35#pragma GCC diagnostic push
36#pragma GCC diagnostic ignored "-Wconversion"
37#endif
38
39
40/*!
41 @brief SSE2 reinitialization guard class
42
43 SSE2 requires a call to _mm_empty() when intermixing
44 MMX integer instructions with floating point arithmetic.
45 This class guards critical code fragments where SSE2 integer
46 code is used.
47
48 As of 2015 _mm_empty() is considered deprecated, and is not even recognized
49 by some compilers (like MSVC) in 64-bit mode.
50 As MMX instructions have become obsolete, the use of _mm_empty()
51 is deprecated here and commented out.
52
53 @ingroup SSE2
54*/
55class sse_empty_guard
56{
57public:
58 BMFORCEINLINE sse_empty_guard() BMNOEXCEPT
59 {
60 //_mm_empty();
61 }
62
63 BMFORCEINLINE ~sse_empty_guard() BMNOEXCEPT
64 {
65 //_mm_empty();
66 }
67};
68
69
70
71/*!
72 @brief XOR array elements with a specified mask
73 *dst = *src ^ mask
74
75 @ingroup SSE2
76*/
77inline
78void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
79 const __m128i* BMRESTRICT src,
80 const __m128i* BMRESTRICT src_end,
81 bm::word_t mask) BMNOEXCEPT
82{
83 __m128i xM = _mm_set1_epi32((int)mask);
84 do
85 {
86 _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
87 _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
88 _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
89 _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
90 dst += 4; src += 4;
91 } while (src < src_end);
92}
93
94
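A minimal usage sketch (illustrative only, not part of the header): the kernel above expects 16-byte aligned block pointers and a src_end computed from bm::set_block_size 32-bit words; the scalar equivalent is dst_w[i] = src_w[i] ^ mask. The helper name below is hypothetical.

    // Apply "dst = src ^ mask" to one aligned bit block (hypothetical caller).
    inline void xor_block_with_mask(bm::word_t* dst_w,
                                    const bm::word_t* src_w,
                                    bm::word_t mask)
    {
        __m128i* dst = (__m128i*) dst_w;                 // must be 16-byte aligned
        const __m128i* src = (const __m128i*) src_w;     // must be 16-byte aligned
        const __m128i* src_end = (const __m128i*)(src_w + bm::set_block_size);
        bm::sse2_xor_arr_2_mask(dst, src, src_end, mask);
    }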
95/*!
96 @brief Invert array elements and AND the result with a specified mask
97 *dst = ~*src & mask
98
99 @ingroup SSE2
100*/
101inline
102void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
103 const __m128i* BMRESTRICT src,
104 const __m128i* BMRESTRICT src_end,
105 bm::word_t mask) BMNOEXCEPT
106{
107 __m128i xM = _mm_set1_epi32((int)mask);
108 do
109 {
110 _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM)); // xmm1 = (~xmm1) & xM
111 _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
112 _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
113 _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
114 dst += 4; src += 4;
115 } while (src < src_end);
116}
117
118/*!
119 @brief AND two blocks
120 *dst &= *src
121 @return 0 if no bits were set
122 @ingroup SSE2
123*/
124
125inline
126unsigned sse2_and_block(__m128i* BMRESTRICT dst,
127 const __m128i* BMRESTRICT src) BMNOEXCEPT
128{
129 __m128i m1A, m1B, m1C, m1D;
130 __m128i accA, accB, accC, accD;
131 const __m128i* BMRESTRICT src_end =
132 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
133
134 accA = accB = accC = accD = _mm_setzero_si128();
135
136 do
137 {
138 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
139 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
140 _mm_store_si128(dst+0, m1A);
141 _mm_store_si128(dst+1, m1B);
142 accA = _mm_or_si128(accA, m1A);
143 accB = _mm_or_si128(accB, m1B);
144
145 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
146 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
147 _mm_store_si128(dst+2, m1C);
148 _mm_store_si128(dst+3, m1D);
149
150 accC = _mm_or_si128(accC, m1C);
151 accD = _mm_or_si128(accD, m1D);
152 src += 4; dst += 4;
153
154
155 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
156 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
157 _mm_store_si128(dst+0, m1A);
158 _mm_store_si128(dst+1, m1B);
159 accA = _mm_or_si128(accA, m1A);
160 accB = _mm_or_si128(accB, m1B);
161
162 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
163 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
164 _mm_store_si128(dst+2, m1C);
165 _mm_store_si128(dst+3, m1D);
166
167 accC = _mm_or_si128(accC, m1C);
168 accD = _mm_or_si128(accD, m1D);
169 src += 4; dst += 4;
170
171 } while (src < src_end);
172
173 accA = _mm_or_si128(accA, accB); // A = A | B
174 accC = _mm_or_si128(accC, accD); // C = C | D
175 accA = _mm_or_si128(accA, accC); // A = A | C
176
177
178 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
179 _mm_store_si128((__m128i*)macc, accA);
180 return macc[0] | macc[1] | macc[2] | macc[3];
181}
182
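The return convention above is worth spelling out: every AND result is OR-ed into the four accumulators, which are folded and reduced to one word, so a zero return means the whole destination block became empty. A scalar sketch of the same idea (illustrative only, not part of the header):

    // Word-level equivalent of the accumulator trick in sse2_and_block().
    inline unsigned and_block_scalar(bm::word_t* dst, const bm::word_t* src)
    {
        unsigned acc = 0;
        for (unsigned i = 0; i < bm::set_block_size; ++i)
            acc |= (dst[i] &= src[i]);   // acc collects every surviving bit
        return acc;                      // 0 => destination block is all zero
    }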
183/*
184inline
185unsigned sse2_and_block(__m128i* BMRESTRICT dst,
186 const __m128i* BMRESTRICT src) BMNOEXCEPT
187{
188 __m128i m1A, m1B, m1C, m1D;
189 __m128i accA, accB, accC, accD;
190 const __m128i* BMRESTRICT src_end =
191 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
192
193 const __m128i* BMRESTRICT src2 =
194 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size/2);
195
196 __m128i* BMRESTRICT dst2 =
197 ( __m128i*)((bm::word_t*)(dst) + bm::set_block_size/2);
198
199 accA = accB = accC = accD = _mm_setzero_si128();
200
201 do
202 {
203 m1A = _mm_and_si128(_mm_load_si128(src), _mm_load_si128(dst+0));
204 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
205 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
206 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
207
208 _mm_store_si128(dst, m1A);
209 _mm_store_si128(dst+1, m1B);
210 _mm_store_si128(dst+2, m1C);
211 _mm_store_si128(dst+3, m1D);
212
213 accA = _mm_or_si128(accA, m1A);
214 accB = _mm_or_si128(accB, m1B);
215 accC = _mm_or_si128(accC, m1C);
216 accD = _mm_or_si128(accD, m1D);
217
218 src += 4; dst += 4;
219
220 m1A = _mm_and_si128(_mm_load_si128(src2), _mm_load_si128(dst2));
221 m1B = _mm_and_si128(_mm_load_si128(src2+1), _mm_load_si128(dst2+1));
222 m1C = _mm_and_si128(_mm_load_si128(src2+2), _mm_load_si128(dst2+2));
223 m1D = _mm_and_si128(_mm_load_si128(src2+3), _mm_load_si128(dst2+3));
224
225 _mm_store_si128(dst2, m1A);
226 _mm_store_si128(dst2+1, m1B);
227 _mm_store_si128(dst2+2, m1C);
228 _mm_store_si128(dst2+3, m1D);
229
230 accA = _mm_or_si128(accA, m1A);
231 accB = _mm_or_si128(accB, m1B);
232 accC = _mm_or_si128(accC, m1C);
233 accD = _mm_or_si128(accD, m1D);
234
235
236 src2 += 4; dst2 += 4;
237 } while (src2 < src_end);
238
239 accA = _mm_or_si128(accA, accB); // A = A | B
240 accC = _mm_or_si128(accC, accD); // C = C | D
241 accA = _mm_or_si128(accA, accC); // A = A | C
242
243
244 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
245 _mm_store_si128((__m128i*)macc, accA);
246 return macc[0] | macc[1] | macc[2] | macc[3];
247}
248*/
249
250/*!
251 @brief AND array elements against another array (unaligned)
252 *dst &= *src
253
254 @return 0 if no bits were set
255
256 @ingroup SSE2
257*/
258inline
259unsigned sse2_and_arr_unal(__m128i* BMRESTRICT dst,
260 const __m128i* BMRESTRICT src,
261 const __m128i* BMRESTRICT src_end) BMNOEXCEPT
262{
263 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
264 __m128i accA, accB, accC, accD;
265
266 accA = _mm_setzero_si128();
267 accB = _mm_setzero_si128();
268 accC = _mm_setzero_si128();
269 accD = _mm_setzero_si128();
270
271 do
272 {
273 m1A = _mm_loadu_si128(src+0);
274 m2A = _mm_load_si128(dst+0);
275 m1A = _mm_and_si128(m1A, m2A);
276 _mm_store_si128(dst+0, m1A);
277 accA = _mm_or_si128(accA, m1A);
278
279 m1B = _mm_loadu_si128(src+1);
280 m2B = _mm_load_si128(dst+1);
281 m1B = _mm_and_si128(m1B, m2B);
282 _mm_store_si128(dst+1, m1B);
283 accB = _mm_or_si128(accB, m1B);
284
285 m1C = _mm_loadu_si128(src+2);
286 m2C = _mm_load_si128(dst+2);
287 m1C = _mm_and_si128(m1C, m2C);
288 _mm_store_si128(dst+2, m1C);
289 accC = _mm_or_si128(accC, m1C);
290
291 m1D = _mm_loadu_si128(src+3);
292 m2D = _mm_load_si128(dst+3);
293 m1D = _mm_and_si128(m1D, m2D);
294 _mm_store_si128(dst+3, m1D);
295 accD = _mm_or_si128(accD, m1D);
296
297 src += 4; dst += 4;
298 } while (src < src_end);
299
300 accA = _mm_or_si128(accA, accB); // A = A | B
301 accC = _mm_or_si128(accC, accD); // C = C | D
302 accA = _mm_or_si128(accA, accC); // A = A | C
303
304
305 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
306 _mm_store_si128((__m128i*)macc, accA);
307 return macc[0] | macc[1] | macc[2] | macc[3];
308}
309
310
311inline
312unsigned sse2_and_block(__m128i* BMRESTRICT dst,
313 const __m128i* BMRESTRICT src,
314 const __m128i* BMRESTRICT src_end) BMNOEXCEPT
315{
316 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
317 __m128i accA, accB, accC, accD;
318
319 accA = _mm_setzero_si128();
320 accB = _mm_setzero_si128();
321 accC = _mm_setzero_si128();
322 accD = _mm_setzero_si128();
323
324 do
325 {
326 m1A = _mm_load_si128(src + 0);
327 m2A = _mm_load_si128(dst + 0);
328 m1A = _mm_and_si128(m1A, m2A);
329 _mm_store_si128(dst + 0, m1A);
330 accA = _mm_or_si128(accA, m1A);
331
332 m1B = _mm_load_si128(src + 1);
333 m2B = _mm_load_si128(dst + 1);
334 m1B = _mm_and_si128(m1B, m2B);
335 _mm_store_si128(dst + 1, m1B);
336 accB = _mm_or_si128(accB, m1B);
337
338 m1C = _mm_load_si128(src + 2);
339 m2C = _mm_load_si128(dst + 2);
340 m1C = _mm_and_si128(m1C, m2C);
341 _mm_store_si128(dst + 2, m1C);
342 accC = _mm_or_si128(accC, m1C);
343
344 m1D = _mm_load_si128(src + 3);
345 m2D = _mm_load_si128(dst + 3);
346 m1D = _mm_and_si128(m1D, m2D);
347 _mm_store_si128(dst + 3, m1D);
348 accD = _mm_or_si128(accD, m1D);
349
350 src += 4; dst += 4;
351 } while (src < src_end);
352
353 accA = _mm_or_si128(accA, accB); // A = A | B
354 accC = _mm_or_si128(accC, accD); // C = C | D
355 accA = _mm_or_si128(accA, accC); // A = A | C
356
357
358 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
359 _mm_store_si128((__m128i*)macc, accA);
360 return macc[0] | macc[1] | macc[2] | macc[3];
361}
362
363
364
365/*!
366 @brief OR array elements against another array
367 *dst |= *src
368 @return true if all bits are 1
369 @ingroup SSE2
370*/
371inline
372bool sse2_or_block(__m128i* BMRESTRICT dst,
373 const __m128i* BMRESTRICT src) BMNOEXCEPT
374{
375 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
376 __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFF
377 __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFF
378 const __m128i* BMRESTRICT src_end =
379 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
380
381 do
382 {
383 m1A = _mm_load_si128(src + 0);
384 m2A = _mm_load_si128(dst + 0);
385 m1A = _mm_or_si128(m1A, m2A);
386 _mm_store_si128(dst + 0, m1A);
387
388 m1B = _mm_load_si128(src + 1);
389 m2B = _mm_load_si128(dst + 1);
390 m1B = _mm_or_si128(m1B, m2B);
391 _mm_store_si128(dst + 1, m1B);
392
393 m1C = _mm_load_si128(src + 2);
394 m2C = _mm_load_si128(dst + 2);
395 m1C = _mm_or_si128(m1C, m2C);
396 _mm_store_si128(dst + 2, m1C);
397
398 m1D = _mm_load_si128(src + 3);
399 m2D = _mm_load_si128(dst + 3);
400 m1D = _mm_or_si128(m1D, m2D);
401 _mm_store_si128(dst + 3, m1D);
402
403 mAccF1 = _mm_and_si128(mAccF1, m1C);
404 mAccF1 = _mm_and_si128(mAccF1, m1D);
405 mAccF0 = _mm_and_si128(mAccF0, m1A);
406 mAccF0 = _mm_and_si128(mAccF0, m1B);
407
408 src += 4; dst += 4;
409 } while (src < src_end);
410
411 __m128i maskF = _mm_set1_epi32(~0u);
412 mAccF0 = _mm_and_si128(mAccF0, mAccF1);
413 __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
414 unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
415
416 return (maskA == 0xFFFFu);
417}
418
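The boolean result above works in the opposite direction: each OR result is AND-ed into mAccF0/mAccF1, so the accumulators stay all-ones only if every 128-bit lane did; _mm_cmpeq_epi8 against ~0 then sets all 16 byte-mask bits and _mm_movemask_epi8 yields 0xFFFF. A scalar sketch of the same check (illustrative only, not part of the header):

    // Word-level equivalent of the "all bits set" detection in sse2_or_block().
    inline bool or_block_scalar(bm::word_t* dst, const bm::word_t* src)
    {
        bm::word_t acc = ~0u;
        for (unsigned i = 0; i < bm::set_block_size; ++i)
            acc &= (dst[i] |= src[i]);   // any zero bit anywhere clears acc
        return acc == ~0u;               // true => destination block is fully set
    }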
419/*!
420 @brief OR array elements against another array (unaligned)
421 *dst |= *src
422 @return true if all bits are 1
423 @ingroup SSE2
424*/
425inline
426bool sse2_or_arr_unal(__m128i* BMRESTRICT dst,
427 const __m128i* BMRESTRICT src,
428 const __m128i* BMRESTRICT src_end) BMNOEXCEPT
429{
430 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
431 __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFF
432 __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFF
433 do
434 {
435 m1A = _mm_loadu_si128(src + 0);
436 m2A = _mm_load_si128(dst + 0);
437 m1A = _mm_or_si128(m1A, m2A);
438 _mm_store_si128(dst + 0, m1A);
439
440 m1B = _mm_loadu_si128(src + 1);
441 m2B = _mm_load_si128(dst + 1);
442 m1B = _mm_or_si128(m1B, m2B);
443 _mm_store_si128(dst + 1, m1B);
444
445 m1C = _mm_loadu_si128(src + 2);
446 m2C = _mm_load_si128(dst + 2);
447 m1C = _mm_or_si128(m1C, m2C);
448 _mm_store_si128(dst + 2, m1C);
449
450 m1D = _mm_loadu_si128(src + 3);
451 m2D = _mm_load_si128(dst + 3);
452 m1D = _mm_or_si128(m1D, m2D);
453 _mm_store_si128(dst + 3, m1D);
454
455 mAccF1 = _mm_and_si128(mAccF1, m1C);
456 mAccF1 = _mm_and_si128(mAccF1, m1D);
457 mAccF0 = _mm_and_si128(mAccF0, m1A);
458 mAccF0 = _mm_and_si128(mAccF0, m1B);
459
460 src += 4; dst += 4;
461 } while (src < src_end);
462
463 __m128i maskF = _mm_set1_epi32(~0u);
464 mAccF0 = _mm_and_si128(mAccF0, mAccF1);
465 __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
466 unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
467 return (maskA == 0xFFFFu);
468}
469
470/*!
471 @brief OR two blocks and copy the result to the destination
472 *dst = *src1 | src2
473 @return true if all bits are 1
474
475 @ingroup SSE2
476*/
477inline
478bool sse2_or_block_2way(__m128i* BMRESTRICT dst,
479 const __m128i* BMRESTRICT src1,
480 const __m128i* BMRESTRICT src2) BMNOEXCEPT
481{
482 __m128i m1A, m1B, m1C, m1D;
483 __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFF
484 __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFF
485 const __m128i* BMRESTRICT src_end1 =
486 (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);
487
488 do
489 {
490 m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
491 m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
492 m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
493 m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));
494
495 _mm_store_si128(dst + 0, m1A);
496 _mm_store_si128(dst + 1, m1B);
497 _mm_store_si128(dst + 2, m1C);
498 _mm_store_si128(dst + 3, m1D);
499
500 mAccF1 = _mm_and_si128(mAccF1, m1C);
501 mAccF1 = _mm_and_si128(mAccF1, m1D);
502 mAccF0 = _mm_and_si128(mAccF0, m1A);
503 mAccF0 = _mm_and_si128(mAccF0, m1B);
504
505 src1 += 4; src2 += 4; dst += 4;
506
507 } while (src1 < src_end1);
508
509 __m128i maskF = _mm_set1_epi32(~0u);
510 mAccF0 = _mm_and_si128(mAccF0, mAccF1);
511 __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
512 unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
513 return (maskA == 0xFFFFu);
514}
515
516/*!
517 @brief OR array elements against another 2 arrays
518 *dst |= *src1 | src2
519 @return true if all bits are 1
520
521 @ingroup SSE2
522*/
523inline
524bool sse2_or_block_3way(__m128i* BMRESTRICT dst,
525 const __m128i* BMRESTRICT src1,
526 const __m128i* BMRESTRICT src2) BMNOEXCEPT
527{
528 __m128i m1A, m1B, m1C, m1D;
529 __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFF
530 __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFF
531 const __m128i* BMRESTRICT src_end1 =
532 (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);
533
534 do
535 {
536 m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
537 m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
538 m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
539 m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));
540
541 m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
542 m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
543 m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
544 m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));
545
546 _mm_store_si128(dst + 0, m1A);
547 _mm_store_si128(dst + 1, m1B);
548 _mm_store_si128(dst + 2, m1C);
549 _mm_store_si128(dst + 3, m1D);
550
551 mAccF1 = _mm_and_si128(mAccF1, m1C);
552 mAccF1 = _mm_and_si128(mAccF1, m1D);
553 mAccF0 = _mm_and_si128(mAccF0, m1A);
554 mAccF0 = _mm_and_si128(mAccF0, m1B);
555
556 src1 += 4; src2 += 4; dst += 4;
557
558 } while (src1 < src_end1);
559
560 __m128i maskF = _mm_set1_epi32(~0u);
561 mAccF0 = _mm_and_si128(mAccF0, mAccF1);
562 __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
563 unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
564 return (maskA == 0xFFFFu);
565}
566
567/*!
568 @brief OR array elements against four other arrays
569 *dst |= *src1 | src2 | src3 | src4
570 @return true if all bits are 1
571
572 @ingroup SSE2
573*/
574inline
575bool sse2_or_block_5way(__m128i* BMRESTRICT dst,
576 const __m128i* BMRESTRICT src1,
577 const __m128i* BMRESTRICT src2,
578 const __m128i* BMRESTRICT src3,
579 const __m128i* BMRESTRICT src4) BMNOEXCEPT
580{
581 __m128i m1A, m1B, m1C, m1D;
582 __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFF
583 __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFF
584
585 const __m128i* BMRESTRICT src_end1 =
586 (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);
587
588 do
589 {
590 m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
591 m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
592 m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
593 m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));
594
595 m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
596 m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
597 m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
598 m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));
599
600 m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
601 m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
602 m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
603 m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));
604
605 m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
606 m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
607 m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
608 m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));
609
610 _mm_stream_si128(dst + 0, m1A);
611 _mm_stream_si128(dst + 1, m1B);
612 _mm_stream_si128(dst + 2, m1C);
613 _mm_stream_si128(dst + 3, m1D);
614
615 mAccF1 = _mm_and_si128(mAccF1, m1C);
616 mAccF1 = _mm_and_si128(mAccF1, m1D);
617 mAccF0 = _mm_and_si128(mAccF0, m1A);
618 mAccF0 = _mm_and_si128(mAccF0, m1B);
619
620 src1 += 4; src2 += 4;
621 src3 += 4; src4 += 4;
622
623 _mm_prefetch ((const char*)src3, _MM_HINT_T0);
624 _mm_prefetch ((const char*)src4, _MM_HINT_T0);
625
626 dst += 4;
627
628 } while (src1 < src_end1);
629
630 __m128i maskF = _mm_set1_epi32(~0u);
631 mAccF0 = _mm_and_si128(mAccF0, mAccF1);
632 __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
633 unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
634 return (maskA == 0xFFFFu);
635}
636
637
638
639/*!
640 @brief XOR block against another
641 *dst ^= *src
642 @return 0 if no bits were set
643 @ingroup SSE2
644*/
645inline
646unsigned sse2_xor_block(__m128i* BMRESTRICT dst,
647 const __m128i* BMRESTRICT src) BMNOEXCEPT
648{
649 __m128i m1A, m1B, m1C, m1D;
650 __m128i accA, accB, accC, accD;
651
652 const __m128i* BMRESTRICT src_end =
653 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
654
655 accA = accB = accC = accD = _mm_setzero_si128();
656
657 do
658 {
659 m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
660 m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
661 m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
662 m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
663
664 _mm_store_si128(dst+0, m1A);
665 _mm_store_si128(dst+1, m1B);
666 _mm_store_si128(dst+2, m1C);
667 _mm_store_si128(dst+3, m1D);
668
669 accA = _mm_or_si128(accA, m1A);
670 accB = _mm_or_si128(accB, m1B);
671 accC = _mm_or_si128(accC, m1C);
672 accD = _mm_or_si128(accD, m1D);
673
674 src += 4; dst += 4;
675 } while (src < src_end);
676
677 accA = _mm_or_si128(accA, accB); // A = A | B
678 accC = _mm_or_si128(accC, accD); // C = C | D
679 accA = _mm_or_si128(accA, accC); // A = A | C
680
681 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
682 _mm_store_si128((__m128i*)macc, accA);
683 return macc[0] | macc[1] | macc[2] | macc[3];
684}
685
686/*!
687 @brief 3 operand XOR
688 *dst = *src1 ^ src2
689 @return 0 if no bits were set
690 @ingroup SSE2
691*/
692inline
693unsigned sse2_xor_block_2way(__m128i* BMRESTRICT dst,
694 const __m128i* BMRESTRICT src1,
695 const __m128i* BMRESTRICT src2) BMNOEXCEPT
696{
697 __m128i m1A, m1B, m1C, m1D;
698 __m128i accA, accB, accC, accD;
699
700 const __m128i* BMRESTRICT src1_end =
701 (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);
702
703 accA = accB = accC = accD = _mm_setzero_si128();
704
705 do
706 {
707 m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
708 m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
709 m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
710 m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));
711
712 _mm_store_si128(dst + 0, m1A);
713 _mm_store_si128(dst + 1, m1B);
714 _mm_store_si128(dst + 2, m1C);
715 _mm_store_si128(dst + 3, m1D);
716
717 accA = _mm_or_si128(accA, m1A);
718 accB = _mm_or_si128(accB, m1B);
719 accC = _mm_or_si128(accC, m1C);
720 accD = _mm_or_si128(accD, m1D);
721
722 src1 += 4; src2 += 4; dst += 4;
723 } while (src1 < src1_end);
724
725 accA = _mm_or_si128(accA, accB); // A = A | B
726 accC = _mm_or_si128(accC, accD); // C = C | D
727 accA = _mm_or_si128(accA, accC); // A = A | C
728
729 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
730 _mm_store_si128((__m128i*)macc, accA);
731 return macc[0] | macc[1] | macc[2] | macc[3];
732}
733
734
735/*!
736 @brief AND-NOT (SUB) array elements against another array
737 *dst &= ~*src
738
739 @return 0 if no bits were set
740
741 @ingroup SSE2
742*/
743inline
744unsigned sse2_sub_block(__m128i* BMRESTRICT dst,
745 const __m128i* BMRESTRICT src) BMNOEXCEPT
746{
747 __m128i m1A, m1B, m1C, m1D;
748 __m128i accA, accB, accC, accD;
749
750 accA = accB = accC = accD = _mm_setzero_si128();
751
752 const __m128i* BMRESTRICT src_end =
753 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
754
755 do
756 {
757 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
758 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
759 _mm_store_si128(dst+0, m1A);
760 _mm_store_si128(dst+1, m1B);
761 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
762 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
763 _mm_store_si128(dst+2, m1C);
764 _mm_store_si128(dst+3, m1D);
765
766 accA = _mm_or_si128(accA, m1A);
767 accB = _mm_or_si128(accB, m1B);
768 accC = _mm_or_si128(accC, m1C);
769 accD = _mm_or_si128(accD, m1D);
770
771 src += 4; dst += 4;
772
773 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
774 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
775 _mm_store_si128(dst+0, m1A);
776 _mm_store_si128(dst+1, m1B);
777 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
778 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
779 _mm_store_si128(dst+2, m1C);
780 _mm_store_si128(dst+3, m1D);
781
782 accA = _mm_or_si128(accA, m1A);
783 accB = _mm_or_si128(accB, m1B);
784 accC = _mm_or_si128(accC, m1C);
785 accD = _mm_or_si128(accD, m1D);
786
787 src += 4; dst += 4;
788 } while (src < src_end);
789
790 accA = _mm_or_si128(accA, accB); // A = A | B
791 accC = _mm_or_si128(accC, accD); // C = C | D
792 accA = _mm_or_si128(accA, accC); // A = A | C
793
794
795 bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
796 _mm_store_si128((__m128i*)macc, accA);
797 return macc[0] | macc[1] | macc[2] | macc[3];
798}
799
800
801/*!
802 @brief SSE2 block memset
803 *dst = value
804
805 @ingroup SSE2
806*/
807
808inline
809void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
810{
811 __m128i* BMRESTRICT dst_end =
812 (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);
813
814 __m128i xmm0 = _mm_set1_epi32((int)value);
815 do
816 {
817 _mm_store_si128(dst, xmm0);
818 _mm_store_si128(dst+1, xmm0);
819 _mm_store_si128(dst+2, xmm0);
820 _mm_store_si128(dst+3, xmm0);
821
822 _mm_store_si128(dst+4, xmm0);
823 _mm_store_si128(dst+5, xmm0);
824 _mm_store_si128(dst+6, xmm0);
825 _mm_store_si128(dst+7, xmm0);
826
827 dst += 8;
828 } while (dst < dst_end);
829}
830
831/*!
832 @brief SSE2 block copy
833 *dst = *src
834
835 @ingroup SSE2
836*/
837inline
838void sse2_copy_block(__m128i* BMRESTRICT dst,
839 const __m128i* BMRESTRICT src) BMNOEXCEPT
840{
841 __m128i xmm0, xmm1, xmm2, xmm3;
842 const __m128i* BMRESTRICT src_end =
843 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
844
845 do
846 {
847 xmm0 = _mm_load_si128(src+0);
848 xmm1 = _mm_load_si128(src+1);
849 xmm2 = _mm_load_si128(src+2);
850 xmm3 = _mm_load_si128(src+3);
851
852 _mm_store_si128(dst+0, xmm0);
853 _mm_store_si128(dst+1, xmm1);
854 _mm_store_si128(dst+2, xmm2);
855 _mm_store_si128(dst+3, xmm3);
856
857 xmm0 = _mm_load_si128(src+4);
858 xmm1 = _mm_load_si128(src+5);
859 xmm2 = _mm_load_si128(src+6);
860 xmm3 = _mm_load_si128(src+7);
861
862 _mm_store_si128(dst+4, xmm0);
863 _mm_store_si128(dst+5, xmm1);
864 _mm_store_si128(dst+6, xmm2);
865 _mm_store_si128(dst+7, xmm3);
866
867 src += 8; dst += 8;
868
869 } while (src < src_end);
870}
871
872/*!
873 @brief SSE2 block copy (unaligned SRC)
874 *dst = *src
875
876 @ingroup SSE2
877*/
878inline
879void sse2_copy_block_unalign(__m128i* BMRESTRICT dst,
880 const __m128i* BMRESTRICT src) BMNOEXCEPT
881{
882 __m128i xmm0, xmm1, xmm2, xmm3;
883 const __m128i* BMRESTRICT src_end =
884 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
885
886 do
887 {
888 xmm0 = _mm_loadu_si128(src+0);
889 xmm1 = _mm_loadu_si128(src+1);
890 xmm2 = _mm_loadu_si128(src+2);
891 xmm3 = _mm_loadu_si128(src+3);
892
893 _mm_store_si128(dst+0, xmm0);
894 _mm_store_si128(dst+1, xmm1);
895 _mm_store_si128(dst+2, xmm2);
896 _mm_store_si128(dst+3, xmm3);
897
898 xmm0 = _mm_loadu_si128(src+4);
899 xmm1 = _mm_loadu_si128(src+5);
900 xmm2 = _mm_loadu_si128(src+6);
901 xmm3 = _mm_loadu_si128(src+7);
902
903 _mm_store_si128(dst+4, xmm0);
904 _mm_store_si128(dst+5, xmm1);
905 _mm_store_si128(dst+6, xmm2);
906 _mm_store_si128(dst+7, xmm3);
907
908 src += 8; dst += 8;
909
910 } while (src < src_end);
911}
912
913
914/*!
915 @brief SSE2 block copy
916 *dst = *src
917
918 @ingroup SSE2
919*/
920inline
921void sse2_stream_block(__m128i* BMRESTRICT dst,
922 const __m128i* BMRESTRICT src) BMNOEXCEPT
923{
924 __m128i xmm0, xmm1, xmm2, xmm3;
925 const __m128i* BMRESTRICT src_end =
926 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
927
928 do
929 {
930 xmm0 = _mm_load_si128(src+0);
931 xmm1 = _mm_load_si128(src+1);
932 xmm2 = _mm_load_si128(src+2);
933 xmm3 = _mm_load_si128(src+3);
934
935 _mm_stream_si128(dst+0, xmm0);
936 _mm_stream_si128(dst+1, xmm1);
937 _mm_stream_si128(dst+2, xmm2);
938 _mm_stream_si128(dst+3, xmm3);
939
940 xmm0 = _mm_load_si128(src+4);
941 xmm1 = _mm_load_si128(src+5);
942 xmm2 = _mm_load_si128(src+6);
943 xmm3 = _mm_load_si128(src+7);
944
945 _mm_stream_si128(dst+4, xmm0);
946 _mm_stream_si128(dst+5, xmm1);
947 _mm_stream_si128(dst+6, xmm2);
948 _mm_stream_si128(dst+7, xmm3);
949
950 src += 8; dst += 8;
951
952 } while (src < src_end);
953}
954
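Note that _mm_stream_si128 issues weakly-ordered non-temporal stores that bypass the cache; if the copied block is published to another thread, the producer normally issues a store fence first. A hedged usage sketch (the helper name is an illustration, not a BitMagic API):

    #include <emmintrin.h>   // SSE2 intrinsics (pulls in _mm_sfence via xmmintrin.h)

    // Stream-copy one aligned block, then fence so the non-temporal stores
    // become globally visible before the block pointer is handed off.
    inline void stream_copy_and_publish(__m128i* dst, const __m128i* src)
    {
        bm::sse2_stream_block(dst, src);
        _mm_sfence();
    }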
955/*!
956 @brief SSE2 block copy (unaligned src, non-temporal stream stores)
957 *dst = *src
958
959 @ingroup SSE2
960*/
961inline
962void sse2_stream_block_unalign(__m128i* BMRESTRICT dst,
963 const __m128i* BMRESTRICT src) BMNOEXCEPT
964{
965 __m128i xmm0, xmm1, xmm2, xmm3;
966 const __m128i* BMRESTRICT src_end =
967 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
968
969 do
970 {
971 xmm0 = _mm_loadu_si128(src+0);
972 xmm1 = _mm_loadu_si128(src+1);
973 xmm2 = _mm_loadu_si128(src+2);
974 xmm3 = _mm_loadu_si128(src+3);
975
976 _mm_stream_si128(dst+0, xmm0);
977 _mm_stream_si128(dst+1, xmm1);
978 _mm_stream_si128(dst+2, xmm2);
979 _mm_stream_si128(dst+3, xmm3);
980
981 xmm0 = _mm_loadu_si128(src+4);
982 xmm1 = _mm_loadu_si128(src+5);
983 xmm2 = _mm_loadu_si128(src+6);
984 xmm3 = _mm_loadu_si128(src+7);
985
986 _mm_stream_si128(dst+4, xmm0);
987 _mm_stream_si128(dst+5, xmm1);
988 _mm_stream_si128(dst+6, xmm2);
989 _mm_stream_si128(dst+7, xmm3);
990
991 src += 8; dst += 8;
992
993 } while (src < src_end);
994}
995
996
997/*!
998 @brief Invert bit block
999 *dst = ~*dst
1000 or
1001 *dst ^= ~0u
1002
1003 @ingroup SSE2
1004*/
1005inline
1006void sse2_invert_block(__m128i* BMRESTRICT dst) BMNOEXCEPT
1007{
1008 __m128i maskF = _mm_set1_epi32(~0u);
1009 __m128i* BMRESTRICT dst_end =
1010 (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);
1011
1012 __m128i mA, mB, mC, mD;
1013 do
1014 {
1015 mA = _mm_load_si128(dst + 0);
1016 mB = _mm_load_si128(dst + 1);
1017 mA = _mm_xor_si128(mA, maskF);
1018 mB = _mm_xor_si128(mB, maskF);
1019 _mm_store_si128(dst, mA);
1020 _mm_store_si128(dst + 1, mB);
1021
1022 mC = _mm_load_si128(dst + 2);
1023 mD = _mm_load_si128(dst + 3);
1024 mC = _mm_xor_si128(mC, maskF);
1025 mD = _mm_xor_si128(mD, maskF);
1026 _mm_store_si128(dst + 2, mC);
1027 _mm_store_si128(dst + 3, mD);
1028
1029 dst += 4;
1030
1031 } while (dst < (__m128i*)dst_end);
1032}
1033
1034BMFORCEINLINE
1035__m128i sse2_and(__m128i a, __m128i b) BMNOEXCEPT
1036{
1037 return _mm_and_si128(a, b);
1038}
1039
1040BMFORCEINLINE
1041__m128i sse2_or(__m128i a, __m128i b) BMNOEXCEPT
1042{
1043 return _mm_or_si128(a, b);
1044}
1045
1046
1047BMFORCEINLINE
1048__m128i sse2_xor(__m128i a, __m128i b) BMNOEXCEPT
1049{
1050 return _mm_xor_si128(a, b);
1051}
1052
1053BMFORCEINLINE
1054__m128i sse2_sub(__m128i a, __m128i b) BMNOEXCEPT
1055{
1056 return _mm_andnot_si128(b, a);
1057}
1058
1059
1060/*!
1061 @brief Gap block population count (array sum) utility
1062 @param pbuf - unrolled, aligned to 1-start GAP buffer
1063 @param sse_vect_waves - number of SSE vector lines to process
1064 @param sum - result accumulator
1065 @return tail pointer
1066
1067 @internal
1068 @ingroup SSE2
1069*/
1070inline
1071const bm::gap_word_t* sse2_gap_sum_arr(
1072 const bm::gap_word_t* BMRESTRICT pbuf,
1073 unsigned sse_vect_waves,
1074 unsigned* sum) BMNOEXCEPT
1075{
1076 __m128i xcnt = _mm_setzero_si128();
1077
1078 for (unsigned i = 0; i < sse_vect_waves; ++i)
1079 {
1080 __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
1081 __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
1082 __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
1083 xcnt = _mm_add_epi16(xcnt, mm_s2);
1084 pbuf += 16;
1085 }
1086 xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);
1087
1088 unsigned short* cnt8 = (unsigned short*)&xcnt;
1089 *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
1090 return pbuf;
1091}
1092
1093/**
1094 lower bound (greater or equal) linear scan in an ascending-order sorted array
1095 @ingroup SSE2
1096 \internal
1097*/
1098inline
1099unsigned sse2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
1100 unsigned target,
1101 unsigned from,
1102 unsigned to) BMNOEXCEPT
1103{
1104 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1105 // see more at:
1106 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1107
1108 const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
1109
1110 unsigned unroll_factor = 8;
1111 unsigned len = to - from + 1;
1112 unsigned len_unr = len - (len % unroll_factor);
1113
1114 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1115 __m128i vect_target = _mm_set1_epi32(target);
1116 __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
1117
1118 int mask;
1119 __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
1120
1121 unsigned k = 0;
1122 for (; k < len_unr; k+=unroll_factor)
1123 {
1124 vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
1125 norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
1126
1127 cmp_mask_ge = _mm_or_si128( // GT | EQ
1128 _mm_cmpgt_epi32 (norm_vect40, norm_target),
1129 _mm_cmpeq_epi32 (vect40, vect_target)
1130 );
1131 mask = _mm_movemask_epi8(cmp_mask_ge);
1132 if (mask)
1133 {
1134 int bsf = bm::bit_scan_forward32(mask); //_bit_scan_forward(mask);
1135 return from + k + (bsf / 4);
1136 }
1137 vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
1138 norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
1139
1140 cmp_mask_ge = _mm_or_si128(
1141 _mm_cmpgt_epi32 (norm_vect41, norm_target),
1142 _mm_cmpeq_epi32 (vect41, vect_target)
1143 );
1144 mask = _mm_movemask_epi8(cmp_mask_ge);
1145 if (mask)
1146 {
1147 int bsf = bm::bit_scan_forward32(mask); //_bit_scan_forward(mask);
1148 return 4 + from + k + (bsf / 4);
1149 }
1150 } // for
1151
1152 for (; k < len; ++k)
1153 {
1154 if (arr_base[k] >= target)
1155 return from + k;
1156 }
1157 return to + 1;
1158}
1159
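The biasing trick used above can be restated concretely: SSE2 has no unsigned 32-bit compare, so both operands are shifted by 0x80000000 (subtracting or XOR-ing the sign bit is equivalent) and then compared with the signed _mm_cmpgt_epi32. A scalar sketch of the identity (illustrative only, assumes the usual two's-complement int):

    // For 32-bit unsigned a, b:
    //   (a > b)  <=>  (int)(a ^ 0x80000000u) > (int)(b ^ 0x80000000u)
    // which is what the _mm_sub_epi32(x, mask0x8) / _mm_cmpgt_epi32 pair computes.
    inline bool unsigned_gt_via_signed(unsigned a, unsigned b)
    {
        return (int)(a ^ 0x80000000u) > (int)(b ^ 0x80000000u);
    }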
1160
1161#ifdef __GNUG__
1162#pragma GCC diagnostic pop
1163#endif
1164
1165
1166} // namespace
1167
1168
1169
1170#endif
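A minimal end-to-end sketch of how these kernels are typically driven (illustrative only; the include, the 16-byte alignment requirement, and the helper code below are assumptions, not part of this header):

    #include <cstdlib>
    #include "bm.h"   // assumption: built with SSE2 optimization enabled so bmsse_util.h is pulled in

    int main()
    {
        // Two 16-byte aligned blocks of bm::set_block_size 32-bit words.
        const std::size_t bytes = bm::set_block_size * sizeof(bm::word_t);
        bm::word_t* a = (bm::word_t*) std::aligned_alloc(16, bytes);
        bm::word_t* b = (bm::word_t*) std::aligned_alloc(16, bytes);

        bm::sse2_set_block((__m128i*)a, ~0u);   // a = all ones
        bm::sse2_set_block((__m128i*)b, 0u);    // b = all zeros

        bool all_ones = bm::sse2_or_block((__m128i*)b, (const __m128i*)a);
        // all_ones is true: b |= a set every word to 0xFFFFFFFF

        std::free(a); std::free(b);
        return all_ones ? 0 : 1;
    }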