BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 #ifdef _MSC_VER
50 #pragma warning( push )
51 #pragma warning( disable : 4146)
52 #endif
53 
54 
55 /*
56 inline
57 void sse2_print128(const char* prefix, const __m128i & value)
58 {
59  const size_t n = sizeof(__m128i) / sizeof(unsigned);
60  unsigned buffer[n];
61  _mm_storeu_si128((__m128i*)buffer, value);
62  std::cout << prefix << " [ ";
63  for (int i = n-1; 1; --i)
64  {
65  std::cout << buffer[i] << " ";
66  if (i == 0)
67  break;
68  }
69  std::cout << "]" << std::endl;
70 }
71 */
72 
73 /*!
74  SSE4.2 optimized bit counting.
75  @ingroup SSE4
76 */
77 inline
78 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
79 {
80  bm::id_t count = 0;
81 #ifdef BM64_SSE4
82  const bm::id64_t* b = (bm::id64_t*) block;
83  const bm::id64_t* b_end = (bm::id64_t*) block_end;
84  do
85  {
86  count += unsigned( _mm_popcnt_u64(b[0]) +
87  _mm_popcnt_u64(b[1]));
88  b += 2;
89  } while (b < b_end);
90 #else
91  do
92  {
93  const unsigned* b = (unsigned*) block;
94  count += _mm_popcnt_u32(b[0]) +
95  _mm_popcnt_u32(b[1]) +
96  _mm_popcnt_u32(b[2]) +
97  _mm_popcnt_u32(b[3]);
98  } while (++block < block_end);
99 #endif
100  return count;
101 }
102 
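A standalone sketch (not part of bmsse4.h, helper name invented) of the 64-bit popcount loop that the BM64_SSE4 branch above relies on; it assumes a 64-bit target and a buffer holding an even number of quadwords, which is always true for BitMagic blocks.

#include <nmmintrin.h>   // _mm_popcnt_u64 (SSE4.2)
#include <cstdint>

inline unsigned popcount_block64(const uint64_t* p, const uint64_t* p_end)
{
    unsigned count = 0;
    for (; p < p_end; p += 2)   // two quadwords per iteration, like sse4_bit_count
        count += unsigned(_mm_popcnt_u64(p[0]) + _mm_popcnt_u64(p[1]));
    return count;
}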
103 /*!
104 \internal
105 */
106 BMFORCEINLINE
107 unsigned op_xor(unsigned a, unsigned b)
108 {
109  unsigned ret = (a ^ b);
110  return ret;
111 }
112 
113 /*!
114 \internal
115 */
116 BMFORCEINLINE
117 unsigned op_or(unsigned a, unsigned b)
118 {
119  return (a | b);
120 }
121 
122 /*!
123 \internal
124 */
125 BMFORCEINLINE
126 unsigned op_and(unsigned a, unsigned b)
127 {
128  return (a & b);
129 }
130 
131 
132 template<class Func>
133 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
134  const __m128i* BMRESTRICT block_end,
135  const __m128i* BMRESTRICT mask_block,
136  Func sse2_func)
137 {
138  bm::id_t count = 0;
139 #ifdef BM64_SSE4
140  do
141  {
142  __m128i tmp0 = _mm_load_si128(block);
143  __m128i tmp1 = _mm_load_si128(mask_block);
144  __m128i b = sse2_func(tmp0, tmp1);
145 
146  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
148 
149  ++block; ++mask_block;
150  } while (block < block_end);
151 #else
152  do
153  {
154  __m128i tmp0 = _mm_load_si128(block);
155  __m128i tmp1 = _mm_load_si128(mask_block);
156  __m128i b = sse2_func(tmp0, tmp1);
157 
158  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
162 
163  ++block; ++mask_block;
164  } while (block < block_end);
165 #endif
166 
167  return count;
168 }
169 
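The Func parameter of sse4_bit_count_op is any callable of the shape __m128i f(__m128i, __m128i); the sse2_and/sse2_or/sse2_xor/sse2_sub functors wired in by the VECT_BITCOUNT_* macros further down come from bmsse_util.h. A hypothetical wrapper of the same shape, with a hedged call sketch in the comments (xor_functor is an invented name, not a library symbol):

#include <emmintrin.h>

inline __m128i xor_functor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

// For two 16-byte aligned blocks b1/b2 of bm::set_block_size words, an XOR
// distance count could then look like (illustrative only):
//   bm::id_t d = bm::sse4_bit_count_op((__m128i*)b1,
//                                      (__m128i*)(b1 + bm::set_block_size),
//                                      (__m128i*)b2, xor_functor);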
170 /*!
171  @brief check if block is all zero bits
172  @ingroup SSE4
173 */
174 inline
175 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
176 {
177  __m128i w;
178  __m128i maskz = _mm_setzero_si128();
179  const __m128i* BMRESTRICT block_end =
180  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
181 
182  do
183  {
184  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
186  return false;
187  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
189  return false;
190  block += 4;
191  } while (block < block_end);
192  return true;
193 }
194 
195 /*!
196  @brief check if digest stride is all zero bits
197  @ingroup SSE4
198 */
199 inline
200 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
201 {
202  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204  wA = _mm_or_si128(wA, wB);
205  bool z1 = _mm_test_all_zeros(wA, wA);
206 
207  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209  wA = _mm_or_si128(wA, wB);
210  bool z2 = _mm_test_all_zeros(wA, wA);
211  return z1 & z2;
212 }
213 
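A back-of-envelope check of the digest stride layout assumed here and in the digest functions below (assuming the default bm::set_block_size of 2048 32-bit words):

// one stride: 8 x __m128i loads = 8 * 4 = 32 words = 1024 bits
// whole block: 64 strides * 32 words = 2048 words = bm::set_block_size (assumed default)
// hence each bit of a bm::id64_t digest maps to exactly one 1024-bit stride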
214 /*!
215  @brief set digest stride to 0xFF.. or 0x0 value
216  @ingroup SSE4
217 */
218 inline
219 void sse4_block_set_digest(__m128i* dst, unsigned value)
220 {
221  __m128i mV = _mm_set1_epi32(int(value));
222  _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223  _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224  _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225  _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
226 }
227 
228 
229 /*!
230  @brief AND blocks
231  *dst &= *src
232 
233  @return 0 if no bits were set
234  @ingroup SSE4
235 */
236 inline
237 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
238  const __m128i* BMRESTRICT src)
239 {
240  __m128i m1A, m1B, m1C, m1D;
241  __m128i accA, accB, accC, accD;
242 
243  const __m128i* BMRESTRICT src_end =
244  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
245 
246  accA = accB = accC = accD = _mm_setzero_si128();
247 
248  do
249  {
250  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
254 
255  _mm_store_si128(dst+0, m1A);
256  _mm_store_si128(dst+1, m1B);
257  _mm_store_si128(dst+2, m1C);
258  _mm_store_si128(dst+3, m1D);
259 
260  accA = _mm_or_si128(accA, m1A);
261  accB = _mm_or_si128(accB, m1B);
262  accC = _mm_or_si128(accC, m1C);
263  accD = _mm_or_si128(accD, m1D);
264 
265  src += 4; dst += 4;
266  } while (src < src_end);
267 
268  accA = _mm_or_si128(accA, accB); // A = A | B
269  accC = _mm_or_si128(accC, accD); // C = C | D
270  accA = _mm_or_si128(accA, accC); // A = A | C
271 
272  return !_mm_testz_si128(accA, accA);
273 }
274 
275 
276 /*!
277  @brief AND block digest stride
278  *dst &= *src
279 
280  @return true if stride is all zero
281  @ingroup SSE4
282 */
283 inline
284 bool sse4_and_digest(__m128i* BMRESTRICT dst,
285  const __m128i* BMRESTRICT src)
286 {
287  __m128i m1A, m1B, m1C, m1D;
288 
289  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
293 
294  _mm_store_si128(dst+0, m1A);
295  _mm_store_si128(dst+1, m1B);
296  _mm_store_si128(dst+2, m1C);
297  _mm_store_si128(dst+3, m1D);
298 
299  m1A = _mm_or_si128(m1A, m1B);
300  m1C = _mm_or_si128(m1C, m1D);
301  m1A = _mm_or_si128(m1A, m1C);
302 
303  bool z1 = _mm_testz_si128(m1A, m1A);
304 
305  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
309 
310  _mm_store_si128(dst+4, m1A);
311  _mm_store_si128(dst+5, m1B);
312  _mm_store_si128(dst+6, m1C);
313  _mm_store_si128(dst+7, m1D);
314 
315  m1A = _mm_or_si128(m1A, m1B);
316  m1C = _mm_or_si128(m1C, m1D);
317  m1A = _mm_or_si128(m1A, m1C);
318 
319  bool z2 = _mm_testz_si128(m1A, m1A);
320 
321  return z1 & z2;
322 }
323 
324 /*!
325  @brief AND block digest stride
326  *dst = *src1 & *src2
327 
328  @return true if stride is all zero
329  @ingroup SSE4
330 */
331 inline
332 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
333  const __m128i* BMRESTRICT src1,
334  const __m128i* BMRESTRICT src2)
335 {
336  __m128i m1A, m1B, m1C, m1D;
337 
338  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
342 
343  _mm_store_si128(dst+0, m1A);
344  _mm_store_si128(dst+1, m1B);
345  _mm_store_si128(dst+2, m1C);
346  _mm_store_si128(dst+3, m1D);
347 
348  m1A = _mm_or_si128(m1A, m1B);
349  m1C = _mm_or_si128(m1C, m1D);
350  m1A = _mm_or_si128(m1A, m1C);
351 
352  bool z1 = _mm_testz_si128(m1A, m1A);
353 
354  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
358 
359  _mm_store_si128(dst+4, m1A);
360  _mm_store_si128(dst+5, m1B);
361  _mm_store_si128(dst+6, m1C);
362  _mm_store_si128(dst+7, m1D);
363 
364  m1A = _mm_or_si128(m1A, m1B);
365  m1C = _mm_or_si128(m1C, m1D);
366  m1A = _mm_or_si128(m1A, m1C);
367 
368  bool z2 = _mm_testz_si128(m1A, m1A);
369 
370  return z1 & z2;
371 }
372 
373 /*!
374  @brief AND block digest stride
375  @return true if stride is all zero
376  @ingroup SSE4
377 */
378 inline
379 bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
380  const __m128i* BMRESTRICT src1,
381  const __m128i* BMRESTRICT src2,
382  const __m128i* BMRESTRICT src3,
383  const __m128i* BMRESTRICT src4)
384 {
385  __m128i m1A, m1B, m1C, m1D;
386  __m128i m1E, m1F, m1G, m1H;
387 
388  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
392 
393  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
397 
398  m1A = _mm_and_si128(m1A, m1E);
399  m1B = _mm_and_si128(m1B, m1F);
400  m1C = _mm_and_si128(m1C, m1G);
401  m1D = _mm_and_si128(m1D, m1H);
402 
403  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
407 
408  _mm_store_si128(dst+0, m1A);
409  _mm_store_si128(dst+1, m1B);
410  _mm_store_si128(dst+2, m1C);
411  _mm_store_si128(dst+3, m1D);
412 
413  m1A = _mm_or_si128(m1A, m1B);
414  m1C = _mm_or_si128(m1C, m1D);
415  m1A = _mm_or_si128(m1A, m1C);
416 
417  bool z1 = _mm_testz_si128(m1A, m1A);
418 
419  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
423 
424  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
428 
429  m1A = _mm_and_si128(m1A, m1E);
430  m1B = _mm_and_si128(m1B, m1F);
431  m1C = _mm_and_si128(m1C, m1G);
432  m1D = _mm_and_si128(m1D, m1H);
433 
434  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
438 
439  _mm_store_si128(dst+4, m1A);
440  _mm_store_si128(dst+5, m1B);
441  _mm_store_si128(dst+6, m1C);
442  _mm_store_si128(dst+7, m1D);
443 
444  m1A = _mm_or_si128(m1A, m1B);
445  m1C = _mm_or_si128(m1C, m1D);
446  m1A = _mm_or_si128(m1A, m1C);
447 
448  bool z2 = _mm_testz_si128(m1A, m1A);
449 
450  return z1 & z2;
451 }
452 
453 
454 /*!
455  @brief SUB (AND NOT) block digest stride
456  *dst &= ~*src
457 
458  @return true if stride is all zero
459  @ingroup SSE4
460 */
461 inline
462 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
463  const __m128i* BMRESTRICT src)
464 {
465  __m128i m1A, m1B, m1C, m1D;
466 
467  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
471 
472  _mm_store_si128(dst+0, m1A);
473  _mm_store_si128(dst+1, m1B);
474  _mm_store_si128(dst+2, m1C);
475  _mm_store_si128(dst+3, m1D);
476 
477  m1A = _mm_or_si128(m1A, m1B);
478  m1C = _mm_or_si128(m1C, m1D);
479  m1A = _mm_or_si128(m1A, m1C);
480 
481  bool z1 = _mm_testz_si128(m1A, m1A);
482 
483  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
487 
488  _mm_store_si128(dst+4, m1A);
489  _mm_store_si128(dst+5, m1B);
490  _mm_store_si128(dst+6, m1C);
491  _mm_store_si128(dst+7, m1D);
492 
493  m1A = _mm_or_si128(m1A, m1B);
494  m1C = _mm_or_si128(m1C, m1D);
495  m1A = _mm_or_si128(m1A, m1C);
496 
497  bool z2 = _mm_testz_si128(m1A, m1A);
498 
499  return z1 & z2;
500 }
501 
502 
503 /*!
504  @brief 2-operand SUB (AND NOT) block digest stride
505  *dst = *src1 & ~*src2
506 
507  @return true if stride is all zero
508  @ingroup SSE4
509 */
510 inline
511 bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
512  const __m128i* BMRESTRICT src1,
513  const __m128i* BMRESTRICT src2)
514 {
515  __m128i m1A, m1B, m1C, m1D;
516 
517  m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518  m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519  m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520  m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
521 
522  _mm_store_si128(dst+0, m1A);
523  _mm_store_si128(dst+1, m1B);
524  _mm_store_si128(dst+2, m1C);
525  _mm_store_si128(dst+3, m1D);
526 
527  m1A = _mm_or_si128(m1A, m1B);
528  m1C = _mm_or_si128(m1C, m1D);
529  m1A = _mm_or_si128(m1A, m1C);
530 
531  bool z1 = _mm_testz_si128(m1A, m1A);
532 
533  m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534  m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535  m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536  m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
537 
538  _mm_store_si128(dst+4, m1A);
539  _mm_store_si128(dst+5, m1B);
540  _mm_store_si128(dst+6, m1C);
541  _mm_store_si128(dst+7, m1D);
542 
543  m1A = _mm_or_si128(m1A, m1B);
544  m1C = _mm_or_si128(m1C, m1D);
545  m1A = _mm_or_si128(m1A, m1C);
546 
547  bool z2 = _mm_testz_si128(m1A, m1A);
548 
549  return z1 & z2;
550 }
551 
552 
553 
554 /*!
555  @brief check if block is all one bits
556  @ingroup SSE4
557 */
558 inline
559 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
560 {
561  __m128i w;
562  const __m128i* BMRESTRICT block_end =
563  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
564 
565  do
566  {
567  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568  if (!_mm_test_all_ones(w))
569  return false;
570  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571  if (!_mm_test_all_ones(w))
572  return false;
573 
574  block+=4;
575  } while (block < block_end);
576  return true;
577 }
578 
579 /*!
580  @brief check if wave of pointers is all NULL
581  @ingroup SSE4
582 */
583 BMFORCEINLINE
584 bool sse42_test_all_zero_wave(const void* ptr)
585 {
586  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
587  return _mm_testz_si128(w0, w0);
588 }
589 
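A usage sketch, not taken from the library: a "wave" here is one 16-byte span of a pointer array, i.e. two 64-bit pointers (or four 32-bit pointers) tested with a single unaligned load. On a 64-bit build:

//   void* sub_blocks[2] = {nullptr, nullptr};
//   bool all_null = bm::sse42_test_all_zero_wave(sub_blocks);   // -> true
//   sub_blocks[1] = &sub_blocks[0];
//   all_null      = bm::sse42_test_all_zero_wave(sub_blocks);   // -> false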
590 /*!
591  @brief check if 2 waves of pointers are all NULL
592  @ingroup SSE4
593 */
594 BMFORCEINLINE
595 bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1)
596 {
597  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
598  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
599  w0 = _mm_or_si128(w0, w1);
600  return _mm_testz_si128(w0, w0);
601 }
602 
603 /*!
604  @brief check if a wave of 2 pointers is the same (null or FULL)
605  @ingroup SSE4
606 */
607 BMFORCEINLINE
608 bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1)
609 {
610  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
611  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
612  w0 = _mm_xor_si128(w0, w1);
613  return _mm_testz_si128(w0, w0);
614 }
615 
616 
617 /*!
618  SSE4.2 calculate the number of bit value changes (transitions) in a block
619  @ingroup SSE4
620 */
621 inline
622 unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block)
623 {
624  const __m128i* block_end =
625  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
626  __m128i m1COshft, m2COshft;
627 
628  unsigned w0 = *((bm::word_t*)(block));
629  unsigned count = 1;
630 
631  unsigned co2, co1 = 0;
632  for (;block < block_end; block += 2)
633  {
634  __m128i m1A = _mm_load_si128(block);
635  __m128i m2A = _mm_load_si128(block+1);
636 
637  __m128i m1CO = _mm_srli_epi32(m1A, 31);
638  __m128i m2CO = _mm_srli_epi32(m2A, 31);
639 
640  co2 = _mm_extract_epi32(m1CO, 3);
641 
642  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
643  __m128i m2As = _mm_slli_epi32(m2A, 1);
644 
645  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
646  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
647 
648  co1 = co2;
649 
650  co2 = _mm_extract_epi32(m2CO, 3);
651 
652  m2COshft = _mm_slli_si128 (m2CO, 4);
653  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
654 
655  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
656  m2As = _mm_or_si128(m2As, m2COshft);
657 
658  co1 = co2;
659 
660  // we now have two shifted SSE4 regs with carry-over
661  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
662  m2A = _mm_xor_si128(m2A, m2As);
663 
664  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
665  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
666  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
667 
668  m0 = _mm_extract_epi64(m2A, 0);
669  m1 = _mm_extract_epi64(m2A, 1);
670  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
671  }
672  count -= (w0 & 1u); // correct initial carry-in error
673  return count;
674 }
675 
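A scalar reference sketch (invented helper, not library code) of what the routine computes: one plus the number of positions where a bit differs from its predecessor. The SSE routine above seeds the carry with 0 and subtracts (w0 & 1u) afterwards; the sketch seeds the carry with bit 0 itself, so no correction is needed, and both yield the same result.

#include <nmmintrin.h>   // _mm_popcnt_u32
#include <cstdint>

inline unsigned bit_block_change_ref(const uint32_t* block, unsigned size_in_words)
{
    unsigned count = 1;
    uint32_t carry = block[0] & 1u;             // seed so bit 0 compares with itself
    for (unsigned i = 0; i < size_in_words; ++i)
    {
        uint32_t w = block[i];
        uint32_t diff = w ^ ((w << 1) | carry); // each bit vs. its predecessor
        count += unsigned(_mm_popcnt_u32(diff));
        carry = w >> 31;                        // carry the top bit into the next word
    }
    return count;
}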
676 
677 #ifdef __GNUG__
678 // necessary measure to silence a false GCC warning about negative pointer arithmetic
679 #pragma GCC diagnostic push
680 #pragma GCC diagnostic ignored "-Warray-bounds"
681 #endif
682 
683 /*!
684  SSE4.2 scan of one to two (variable length) 128-bit SSE lanes for GAP search results (8 elements per lane)
685  @ingroup SSE4
686  \internal
687 */
688 inline
689 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
690 {
691  BM_ASSERT(size <= 16);
692  BM_ASSERT(size);
693 
694  const unsigned unroll_factor = 8;
695  if (size < 4) // for very short vector use conventional scan
696  {
697  unsigned j;
698  for (j = 0; j < size; ++j)
699  {
700  if (pbuf[j] >= pos)
701  break;
702  }
703  return j;
704  }
705 
706  __m128i m1, mz, maskF, maskFL;
707 
708  mz = _mm_setzero_si128();
709  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
710 
711  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
712  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
713  int shiftL= (64 - (unroll_factor - size) * 16);
714  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
715 
716  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
717  m1 = _mm_or_si128(m1, maskFL);
718 
719  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
720  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
721  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
722  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
723  if (mi)
724  {
725  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
726  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
727  return unroll_factor - bc; // address of first one element (target)
728  }
729  // inspect the next lane with a possible step back (to avoid reading past the block boundary)
730  // GCC gives a false warning for "- unroll_factor" here
731  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
732  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
733 
734  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
735  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
736  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
737  unsigned bc = _mm_popcnt_u32(mi);
738 
739  return size - bc;
740 }
741 
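For reference, a scalar sketch of the contract (helper name invented; assumes, as sorted GAP buffers guarantee, that pbuf is ascending): the index of the first element that is >= pos, or size when no such element exists.

inline unsigned gap_find_ref(const unsigned short* pbuf, unsigned short pos, unsigned size)
{
    unsigned j = 0;
    for (; j < size; ++j)
        if (pbuf[j] >= pos)
            break;
    return j;
}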
742 /**
743  Experimental (test) function to do SIMD vector search (lower bound)
744  in a sorted, growing array
745  @ingroup SSE4
746 
747  \internal
748 */
749 inline
750 int sse42_cmpge_u32(__m128i vect4, unsigned value)
751 {
752  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
753  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
754  //
755  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
756  __m128i mm_val = _mm_set1_epi32(value);
757 
758  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
759  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
760 
761  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
762  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
763 
764  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
765  int mask = _mm_movemask_epi8(cmp_mask_ge);
766  if (mask)
767  {
768  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
769  return bsf / 4;
770  }
771  return -1;
772 }
773 
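A hedged usage sketch (values invented for illustration; assumes the BitMagic headers are on the include path and the translation unit is built with -msse4.2):

//   __m128i v = _mm_set_epi32(40, 30, 20, 10);   // lanes 0..3 hold 10, 20, 30, 40
//   int idx   = bm::sse42_cmpge_u32(v, 25);      // -> 2  (first element >= 25)
//   int none  = bm::sse42_cmpge_u32(v, 100);     // -> -1 (no element >= 100)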
774 
775 /**
776  lower bound (greater or equal) linear scan in an ascending-order sorted array
777  @ingroup SSE4
778  \internal
779 */
780 inline
781 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
782  unsigned target,
783  unsigned from,
784  unsigned to)
785 {
786  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
787  // see more at:
788  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
789 
790  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
791 
792  unsigned unroll_factor = 8;
793  unsigned len = to - from + 1;
794  unsigned len_unr = len - (len % unroll_factor);
795 
796  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
797  __m128i vect_target = _mm_set1_epi32(target);
798  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
799 
800  int mask;
801  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
802 
803  unsigned k = 0;
804  for (; k < len_unr; k+=unroll_factor)
805  {
806  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
807  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
808 
809  cmp_mask_ge = _mm_or_si128( // GT | EQ
810  _mm_cmpgt_epi32 (norm_vect40, norm_target),
811  _mm_cmpeq_epi32 (vect40, vect_target)
812  );
813  mask = _mm_movemask_epi8(cmp_mask_ge);
814  if (mask)
815  {
816  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
817  return from + k + (bsf / 4);
818  }
819  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
820  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
821 
822  cmp_mask_ge = _mm_or_si128(
823  _mm_cmpgt_epi32 (norm_vect41, norm_target),
824  _mm_cmpeq_epi32 (vect41, vect_target)
825  );
826  mask = _mm_movemask_epi8(cmp_mask_ge);
827  if (mask)
828  {
829  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
830  return 4 + from + k + (bsf / 4);
831  }
832  } // for
833 
834  for (; k < len; ++k)
835  {
836  if (arr_base[k] >= target)
837  return from + k;
838  }
839  return to + 1;
840 }
841 
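A usage sketch under the same assumptions (illustrative values only):

//   const unsigned arr[] = {1, 3, 7, 7, 10, 15, 21, 30, 41, 52};
//   unsigned pos = bm::sse4_lower_bound_scan_u32(arr, 10, 0, 9);  // -> 4
//   // when no element in [from..to] is >= target, the function returns to + 1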
842 
843 
844 /*!
845  SSE4.2 index lookup to check which elements belong to the same block (8 elements per step)
846  \internal
847 */
848 inline
849 unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
850  unsigned nb, unsigned start)
851 {
852  const unsigned unroll_factor = 8;
853  const unsigned len = (size - start);
854  const unsigned len_unr = len - (len % unroll_factor);
855  unsigned k;
856 
857  idx += start;
858 
859  __m128i nbM = _mm_set1_epi32(nb);
860 
861  for (k = 0; k < len_unr; k+=unroll_factor)
862  {
863  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
864  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
865  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
866  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
867 
868  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
869  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
870  break;
871 
872  } // for k
873  for (; k < len; ++k)
874  {
875  if (nb != unsigned(idx[k] >> bm::set_block_shift))
876  break;
877  }
878  return start + k;
879 }
880 
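A worked example of the contract (hedged; assumes the default bm::set_block_shift of 16, i.e. 65536 bits per block): the function returns the position of the first index that falls outside block nb, so it measures the length of the run starting at 'start'.

//   unsigned idx[] = {1, 100, 65535, 65536};
//   unsigned k = bm::sse42_idx_arr_block_lookup(idx, 4, 0 /*nb*/, 0 /*start*/);
//   // k == 3 : idx[0..2] map to block 0, idx[3] (65536 >> 16 == 1) does not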
881 /*!
882  SSE4.2 bulk bit set
883  \internal
884 */
885 inline
886 void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
887  const unsigned* BMRESTRICT idx,
888  unsigned start, unsigned stop )
889 {
890  const unsigned unroll_factor = 4;
891  const unsigned len = (stop - start);
892  const unsigned len_unr = len - (len % unroll_factor);
893 
894  idx += start;
895 
896  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
897  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
898 
899  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
900  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
901 
902  unsigned k = 0;
903  for (; k < len_unr; k+=unroll_factor)
904  {
905  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
906  __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
907  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
908 
909 
910  nbitA = _mm_and_si128 (nbitA, sw_mask);
911  _mm_store_si128 ((__m128i*)mshift_v, nbitA);
912 
913  // check-compare if all 4 bits are in the very same word
914  //
915  __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
916  __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
917  if (_mm_test_all_ones(cmpA)) // check if all are in one word
918  {
919  unsigned nword = _mm_extract_epi32(nwordA, 0);
920  block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
921  |(1u << mshift_v[2]) | (1u << mshift_v[3]);
922  }
923  else // bits are in different words, use scalar scatter
924  {
925  _mm_store_si128 ((__m128i*)mword_v, nwordA);
926 
927  block[mword_v[0]] |= (1u << mshift_v[0]);
928  block[mword_v[1]] |= (1u << mshift_v[1]);
929  block[mword_v[2]] |= (1u << mshift_v[2]);
930  block[mword_v[3]] |= (1u << mshift_v[3]);
931  }
932 
933  } // for k
934 
935  for (; k < len; ++k)
936  {
937  unsigned n = idx[k];
938  unsigned nbit = unsigned(n & bm::set_block_mask);
939  unsigned nword = nbit >> bm::set_word_shift;
940  nbit &= bm::set_word_mask;
941  block[nword] |= (1u << nbit);
942  } // for k
943 }
944 
945 
946 /*!
947  SSE4.2 bit block gather-scatter
948 
949  @param arr - destination array to set bits
950  @param blk - source bit-block
951  @param idx - gather index array
952  @param size - gather array size
953  @param start - gather start index
954  @param bit_idx - bit to set in the target array
955 
956  \internal
957 
958  C algorithm:
959 
960  for (unsigned k = start; k < size; ++k)
961  {
962  nbit = unsigned(idx[k] & bm::set_block_mask);
963  nword = unsigned(nbit >> bm::set_word_shift);
964  mask0 = 1u << (nbit & bm::set_word_mask);
965  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
966  }
967 
968 */
969 inline
970 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
971  const unsigned* BMRESTRICT blk,
972  const unsigned* BMRESTRICT idx,
973  unsigned size,
974  unsigned start,
975  unsigned bit_idx)
976 {
977  const unsigned unroll_factor = 4;
978  const unsigned len = (size - start);
979  const unsigned len_unr = len - (len % unroll_factor);
980 
981  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
982  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
983  __m128i maskFF = _mm_set1_epi32(~0u);
984  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
985 
986  __m128i mask_tmp, mask_0;
987 
988  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
989  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
990 
991  unsigned k = 0;
992  unsigned base = start + k;
993  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
994  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
995  for (; k < len_unr; k+=unroll_factor)
996  {
997  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
998  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
999  // (nbit & bm::set_word_mask)
1000  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1001  _mm_store_si128 ((__m128i*)mword_v, nwordA);
1002 
1003  // mask0 = 1u << (nbit & bm::set_word_mask);
1004  //
1005 #if 0
1006  // ifdefed an alternative SHIFT implementation using SSE and masks
1007  // (it is not faster than just doing scalar operations)
1008  {
1009  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1010  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1011  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1012  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1013  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1014 
1015  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1016  _mm_slli_si128 (am_0, 8));
1017  mask_tmp = _mm_and_si128 (
1018  _mm_slli_epi32(mask1, mshift_v[3]),
1019  _mm_slli_si128 (am_0, 12)
1020  );
1021 
1022  mask_0 = _mm_or_si128 (mask_0,
1023  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1024  }
1025 #endif
1026  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1027 
1028 
1029  // gather for: blk[nword] (.. & mask0 )
1030  //
1031  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1032  blk[mword_v[1]], blk[mword_v[0]]),
1033  mask_0);
1034 
1035  // bool(blk[nword] ...)
1036  //maskFF = _mm_set1_epi32(~0u);
1037  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1038  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1039  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1040 
1041  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1042 
1043  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1044  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1045 
1046  ++idx_ptr; ++target_ptr;
1047  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1048  }
1049 
1050  for (; k < len; ++k)
1051  {
1052  base = start + k;
1053  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1054  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1055  }
1056 
1057 }
1058 
1059 /*!
1060  @brief block shift right by 1
1061  @ingroup SSE4
1062 */
1063 inline
1064 bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
1065 {
1066  __m128i* block_end =
1067  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1068  __m128i m1COshft, m2COshft;
1069  __m128i mAcc = _mm_set1_epi32(0);
1070 
1071  unsigned co2;
1072 
1073  for (;block < block_end; block += 2)
1074  {
1075  __m128i m1A = _mm_load_si128(block);
1076  __m128i m2A = _mm_load_si128(block+1);
1077 
1078  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1079  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1080 
1081  co2 = _mm_extract_epi32(m1CO, 3);
1082 
1083  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1084  m2A = _mm_slli_epi32(m2A, 1);
1085 
1086  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1087  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1088 
1089  co1 = co2;
1090 
1091  co2 = _mm_extract_epi32(m2CO, 3);
1092 
1093  m2COshft = _mm_slli_si128 (m2CO, 4);
1094  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1095 
1096  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1097  m2A = _mm_or_si128(m2A, m2COshft);
1098 
1099  _mm_store_si128(block, m1A);
1100  _mm_store_si128(block+1, m2A);
1101 
1102  mAcc = _mm_or_si128(mAcc, m1A);
1103  mAcc = _mm_or_si128(mAcc, m2A);
1104 
1105  co1 = co2;
1106  }
1107  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1108  return co1;
1109 }
1110 
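A scalar reference sketch of the word-level carry chain above (helper name invented, accumulator tracking omitted): shifting the bit-vector towards higher bit indices is a left shift inside each 32-bit word, with the previous word's bit 31 carried into bit 0 of the next word.

#include <cstdint>

inline bool shift_r1_ref(uint32_t* block, unsigned size_in_words, uint32_t co)
{
    for (unsigned i = 0; i < size_in_words; ++i)
    {
        uint32_t co_next = block[i] >> 31;      // save the outgoing carry
        block[i] = (block[i] << 1) | co;        // shift and inject the incoming carry
        co = co_next;
    }
    return co;                                  // carry-over out of the block
}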
1111 
1112 /*!
1113  @brief block shift right by 1 plus AND
1114 
1115  @return carry over flag
1116  @ingroup SSE4
1117 */
1118 inline
1119 bool sse42_shift_r1_and(__m128i* block,
1120  bm::word_t co1,
1121  const __m128i* BMRESTRICT mask_block,
1122  bm::id64_t* digest)
1123 {
1124  bm::word_t* wblock = (bm::word_t*) block;
1125  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1126 
1127  __m128i m1COshft, m2COshft;
1128  __m128i mAcc = _mm_set1_epi32(0);
1129  unsigned co2;
1130 
1131  bm::id64_t d, wd;
1132  wd = d = *digest;
1133 
1134  unsigned di = 0;
1135  if (!co1)
1136  {
1137  bm::id64_t t = d & -d;
1138  di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1139  }
1140 
1141  for (; di < 64 ; ++di)
1142  {
1143  const unsigned d_base = di * bm::set_block_digest_wave_size;
1144  bm::id64_t dmask = (1ull << di);
1145  if (d & dmask) // digest stride NOT empty
1146  {
1147  block = (__m128i*) &wblock[d_base];
1148  mask_block = (__m128i*) &mblock[d_base];
1149  for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1150  {
1151  __m128i m1A = _mm_load_si128(block);
1152  __m128i m2A = _mm_load_si128(block+1);
1153 
1154  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1155  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1156 
1157  co2 = _mm_extract_epi32(m1CO, 3);
1158 
1159  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1160  m2A = _mm_slli_epi32(m2A, 1);
1161 
1162  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1163  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1164 
1165  co1 = co2;
1166 
1167  co2 = _mm_extract_epi32(m2CO, 3);
1168 
1169  m2COshft = _mm_slli_si128 (m2CO, 4);
1170  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1171 
1172  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1173  m2A = _mm_or_si128(m2A, m2COshft);
1174 
1175  m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1176  m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1177 
1178  mAcc = _mm_or_si128(mAcc, m1A);
1179  mAcc = _mm_or_si128(mAcc, m2A);
1180 
1181  _mm_store_si128(block, m1A);
1182  _mm_store_si128(block+1, m2A);
1183 
1184  co1 = co2;
1185 
1186  } // for i
1187 
1188  if (_mm_testz_si128(mAcc, mAcc))
1189  d &= ~dmask; // clear digest bit
1190  wd &= wd - 1;
1191  }
1192  else
1193  {
1194  if (co1)
1195  {
1196  BM_ASSERT(co1 == 1);
1197  BM_ASSERT(wblock[d_base] == 0);
1198 
1199  unsigned w0 = wblock[d_base] = co1 & mblock[d_base];
1200  d |= (dmask & (bm::id64_t(w0) << di)); // update digest (branchless if (w0))
1201  co1 = 0;
1202  }
1203  if (!wd) // digest is empty, no CO -> exit
1204  break;
1205  }
1206  } // for di
1207 
1208  *digest = d;
1209  return co1;
1210 }
1211 
1212 
1213 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1214  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1215 
1216 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1217  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1218 
1219 #define VECT_BITCOUNT(first, last) \
1220  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1221 
1222 #define VECT_BITCOUNT_AND(first, last, mask) \
1223  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1224 
1225 #define VECT_BITCOUNT_OR(first, last, mask) \
1226  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1227 
1228 #define VECT_BITCOUNT_XOR(first, last, mask) \
1229  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1230 
1231 #define VECT_BITCOUNT_SUB(first, last, mask) \
1232  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1233 
1234 #define VECT_INVERT_BLOCK(first) \
1235  sse2_invert_block((__m128i*)first);
1236 
1237 #define VECT_AND_BLOCK(dst, src) \
1238  sse4_and_block((__m128i*) dst, (__m128i*) (src))
1239 
1240 #define VECT_AND_DIGEST(dst, src) \
1241  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1242 
1243 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1244  sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1245 
1246 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1247  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1248 
1249 #define VECT_OR_BLOCK(dst, src) \
1250  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1251 
1252 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1253  sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1254 
1255 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1256  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1257 
1258 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1259  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1260 
1261 #define VECT_SUB_BLOCK(dst, src) \
1262  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1263 
1264 #define VECT_SUB_DIGEST(dst, src) \
1265  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1266 
1267 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1268  sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1269 
1270 #define VECT_XOR_BLOCK(dst, src) \
1271  sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1272 
1273 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1274  sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1275 
1276 #define VECT_COPY_BLOCK(dst, src) \
1277  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1278 
1279 #define VECT_STREAM_BLOCK(dst, src) \
1280  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1281 
1282 #define VECT_SET_BLOCK(dst, value) \
1283  sse2_set_block((__m128i*) dst, value)
1284 
1285 #define VECT_IS_ZERO_BLOCK(dst) \
1286  sse4_is_all_zero((__m128i*) dst)
1287 
1288 #define VECT_IS_ONE_BLOCK(dst) \
1289  sse4_is_all_one((__m128i*) dst)
1290 
1291 #define VECT_IS_DIGEST_ZERO(start) \
1292  sse4_is_digest_zero((__m128i*)start)
1293 
1294 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1295  sse4_block_set_digest((__m128i*)dst, val)
1296 
1297 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1298  sse4_lower_bound_scan_u32(arr, target, from, to)
1299 
1300 #define VECT_SHIFT_R1(b, acc, co) \
1301  sse42_shift_r1((__m128i*)b, acc, co)
1302 
1303 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1304  sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1305 
1306 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1307  sse42_idx_arr_block_lookup(idx, size, nb, start)
1308 
1309 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1310  sse42_set_block_bits(block, idx, start, stop)
1311 
1312 #define VECT_BLOCK_CHANGE(block) \
1313  sse42_bit_block_calc_change((__m128i*)block)
1314 
1315 
1316 #ifdef __GNUG__
1317 #pragma GCC diagnostic pop
1318 #endif
1319 
1320 #ifdef _MSC_VER
1321 #pragma warning( pop )
1322 #endif
1323 
1324 } // namespace
1325 
1326 
1327 
1328 
1329 #endif