BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 #ifdef _MSC_VER
50 #pragma warning( push )
51 #pragma warning( disable : 4146)
52 #endif
53 
54 
55 /*
56 inline
57 void sse2_print128(const char* prefix, const __m128i & value)
58 {
59  const size_t n = sizeof(__m128i) / sizeof(unsigned);
60  unsigned buffer[n];
61  _mm_storeu_si128((__m128i*)buffer, value);
62  std::cout << prefix << " [ ";
63  for (int i = n-1; 1; --i)
64  {
65  std::cout << buffer[i] << " ";
66  if (i == 0)
67  break;
68  }
69  std::cout << "]" << std::endl;
70 }
71 */
72 
73 /*!
74  SSE4.2 optimized bit counting.
75  @ingroup SSE4
76 */
77 inline
78 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
79 {
80  bm::id_t count = 0;
81 #ifdef BM64_SSE4
82  const bm::id64_t* b = (bm::id64_t*) block;
83  const bm::id64_t* b_end = (bm::id64_t*) block_end;
84  do
85  {
86  count += unsigned( _mm_popcnt_u64(b[0]) +
87  _mm_popcnt_u64(b[1]));
88  b += 2;
89  } while (b < b_end);
90 #else
91  do
92  {
93  const unsigned* b = (unsigned*) block;
94  count += _mm_popcnt_u32(b[0]) +
95  _mm_popcnt_u32(b[1]) +
96  _mm_popcnt_u32(b[2]) +
97  _mm_popcnt_u32(b[3]);
98  } while (++block < block_end);
99 #endif
100  return count;
101 }
102 
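// Usage sketch (hypothetical helper, not part of the library): population count
// of one 16-byte aligned bit-block of bm::set_block_size 32-bit words.
/*
inline bm::id_t example_block_popcount(const bm::word_t* BMRESTRICT block)
{
    const __m128i* b     = (const __m128i*) block;
    const __m128i* b_end = (const __m128i*) (block + bm::set_block_size);
    return bm::sse4_bit_count(b, b_end); // full-block SSE4.2 popcount
}
*/
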
103 /*!
104 \internal
105 */
106 BMFORCEINLINE
107 unsigned op_xor(unsigned a, unsigned b)
108 {
109  unsigned ret = (a ^ b);
110  return ret;
111 }
112 
113 /*!
114 \internal
115 */
116 BMFORCEINLINE
117 unsigned op_or(unsigned a, unsigned b)
118 {
119  return (a | b);
120 }
121 
122 /*!
123 \internal
124 */
125 BMFORCEINLINE
126 unsigned op_and(unsigned a, unsigned b)
127 {
128  return (a & b);
129 }
130 
131 
132 template<class Func>
133 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
134  const __m128i* BMRESTRICT block_end,
135  const __m128i* BMRESTRICT mask_block,
136  Func sse2_func)
137 {
138  bm::id_t count = 0;
139 #ifdef BM64_SSE4
140  do
141  {
142  __m128i tmp0 = _mm_load_si128(block);
143  __m128i tmp1 = _mm_load_si128(mask_block);
144  __m128i b = sse2_func(tmp0, tmp1);
145 
146  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
148 
149  ++block; ++mask_block;
150  } while (block < block_end);
151 #else
152  do
153  {
154  __m128i tmp0 = _mm_load_si128(block);
155  __m128i tmp1 = _mm_load_si128(mask_block);
156  __m128i b = sse2_func(tmp0, tmp1);
157 
158  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
162 
163  ++block; ++mask_block;
164  } while (block < block_end);
165 #endif
166 
167  return count;
168 }
169 
170 /*!
171  @brief check if block is all zero bits
172  @ingroup SSE4
173 */
174 inline
175 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
176 {
177  __m128i w;
178  __m128i maskz = _mm_setzero_si128();
179  const __m128i* BMRESTRICT block_end =
180  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
181 
182  do
183  {
184  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
186  return false;
187  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w2 | w3) != maskz
189  return false;
190  block += 4;
191  } while (block < block_end);
192  return true;
193 }
194 
195 /*!
196  @brief check if digest stride is all zero bits
197  @ingroup SSE4
198 */
199 inline
200 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
201 {
202  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204  wA = _mm_or_si128(wA, wB);
205  bool z1 = _mm_test_all_zeros(wA, wA);
206 
207  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209  wA = _mm_or_si128(wA, wB);
210  bool z2 = _mm_test_all_zeros(wA, wA);
211  return z1 & z2;
212 }
213 
214 /*!
215  @brief set digest stride to 0xFF.. or 0x0 value
216  @ingroup SSE4
217 */
218 inline
219 void sse4_block_set_digest(__m128i* dst, unsigned value)
220 {
221  __m128i mV = _mm_set1_epi32(int(value));
222  _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223  _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224  _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225  _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
226 }
227 
228 
229 /*!
230  @brief AND blocks
231  *dst &= *src
232 
233  @return 0 if no bits were set
234  @ingroup SSE4
235 */
236 inline
237 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
238  const __m128i* BMRESTRICT src)
239 {
240  __m128i m1A, m1B, m1C, m1D;
241  __m128i accA, accB, accC, accD;
242 
243  const __m128i* BMRESTRICT src_end =
244  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
245 
246  accA = accB = accC = accD = _mm_setzero_si128();
247 
248  do
249  {
250  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
254 
255  _mm_store_si128(dst+0, m1A);
256  _mm_store_si128(dst+1, m1B);
257  _mm_store_si128(dst+2, m1C);
258  _mm_store_si128(dst+3, m1D);
259 
260  accA = _mm_or_si128(accA, m1A);
261  accB = _mm_or_si128(accB, m1B);
262  accC = _mm_or_si128(accC, m1C);
263  accD = _mm_or_si128(accD, m1D);
264 
265  src += 4; dst += 4;
266  } while (src < src_end);
267 
268  accA = _mm_or_si128(accA, accB); // A = A | B
269  accC = _mm_or_si128(accC, accD); // C = C | D
270  accA = _mm_or_si128(accA, accC); // A = A | C
271 
272  return !_mm_testz_si128(accA, accA);
273 }
274 
275 
276 /*!
277  @brief AND block digest stride
278  *dst &= *src
279 
280  @return true if stride is all zero
281  @ingroup SSE4
282 */
283 inline
284 bool sse4_and_digest(__m128i* BMRESTRICT dst,
285  const __m128i* BMRESTRICT src)
286 {
287  __m128i m1A, m1B, m1C, m1D;
288 
289  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
293 
294  _mm_store_si128(dst+0, m1A);
295  _mm_store_si128(dst+1, m1B);
296  _mm_store_si128(dst+2, m1C);
297  _mm_store_si128(dst+3, m1D);
298 
299  m1A = _mm_or_si128(m1A, m1B);
300  m1C = _mm_or_si128(m1C, m1D);
301  m1A = _mm_or_si128(m1A, m1C);
302 
303  bool z1 = _mm_testz_si128(m1A, m1A);
304 
305  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
309 
310  _mm_store_si128(dst+4, m1A);
311  _mm_store_si128(dst+5, m1B);
312  _mm_store_si128(dst+6, m1C);
313  _mm_store_si128(dst+7, m1D);
314 
315  m1A = _mm_or_si128(m1A, m1B);
316  m1C = _mm_or_si128(m1C, m1D);
317  m1A = _mm_or_si128(m1A, m1C);
318 
319  bool z2 = _mm_testz_si128(m1A, m1A);
320 
321  return z1 & z2;
322 }
323 
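// Scalar equivalent (sketch, hypothetical helper): one digest stride is the run of
// bm::set_block_digest_wave_size words covered by the eight 128-bit loads above;
// the return value tells the caller whether the stride became all zero.
/*
inline bool example_and_digest_scalar(bm::word_t* BMRESTRICT dst,
                                      const bm::word_t* BMRESTRICT src)
{
    bm::word_t acc = 0;
    for (unsigned i = 0; i < bm::set_block_digest_wave_size; ++i)
        acc |= (dst[i] &= src[i]);   // dst &= src, accumulate surviving bits
    return acc == 0;                 // true -> the digest bit can be cleared
}
*/
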
324 /*!
325  @brief AND block digest stride
326  *dst = *src1 & *src2
327 
328  @return true if stride is all zero
329  @ingroup SSE4
330 */
331 inline
332 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
333  const __m128i* BMRESTRICT src1,
334  const __m128i* BMRESTRICT src2)
335 {
336  __m128i m1A, m1B, m1C, m1D;
337 
338  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
342 
343  _mm_store_si128(dst+0, m1A);
344  _mm_store_si128(dst+1, m1B);
345  _mm_store_si128(dst+2, m1C);
346  _mm_store_si128(dst+3, m1D);
347 
348  m1A = _mm_or_si128(m1A, m1B);
349  m1C = _mm_or_si128(m1C, m1D);
350  m1A = _mm_or_si128(m1A, m1C);
351 
352  bool z1 = _mm_testz_si128(m1A, m1A);
353 
354  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
358 
359  _mm_store_si128(dst+4, m1A);
360  _mm_store_si128(dst+5, m1B);
361  _mm_store_si128(dst+6, m1C);
362  _mm_store_si128(dst+7, m1D);
363 
364  m1A = _mm_or_si128(m1A, m1B);
365  m1C = _mm_or_si128(m1C, m1D);
366  m1A = _mm_or_si128(m1A, m1C);
367 
368  bool z2 = _mm_testz_si128(m1A, m1A);
369 
370  return z1 & z2;
371 }
372 
373 /*!
374  @brief AND block digest stride
375  @return true if stride is all zero
376  @ingroup SSE4
377 */
378 inline
379 bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
380  const __m128i* BMRESTRICT src1,
381  const __m128i* BMRESTRICT src2,
382  const __m128i* BMRESTRICT src3,
383  const __m128i* BMRESTRICT src4)
384 {
385  __m128i m1A, m1B, m1C, m1D;
386  __m128i m1E, m1F, m1G, m1H;
387 
388  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
392 
393  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
397 
398  m1A = _mm_and_si128(m1A, m1E);
399  m1B = _mm_and_si128(m1B, m1F);
400  m1C = _mm_and_si128(m1C, m1G);
401  m1D = _mm_and_si128(m1D, m1H);
402 
403  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
407 
408  _mm_store_si128(dst+0, m1A);
409  _mm_store_si128(dst+1, m1B);
410  _mm_store_si128(dst+2, m1C);
411  _mm_store_si128(dst+3, m1D);
412 
413  m1A = _mm_or_si128(m1A, m1B);
414  m1C = _mm_or_si128(m1C, m1D);
415  m1A = _mm_or_si128(m1A, m1C);
416 
417  bool z1 = _mm_testz_si128(m1A, m1A);
418 
419  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
423 
424  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
428 
429  m1A = _mm_and_si128(m1A, m1E);
430  m1B = _mm_and_si128(m1B, m1F);
431  m1C = _mm_and_si128(m1C, m1G);
432  m1D = _mm_and_si128(m1D, m1H);
433 
434  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
438 
439  _mm_store_si128(dst+4, m1A);
440  _mm_store_si128(dst+5, m1B);
441  _mm_store_si128(dst+6, m1C);
442  _mm_store_si128(dst+7, m1D);
443 
444  m1A = _mm_or_si128(m1A, m1B);
445  m1C = _mm_or_si128(m1C, m1D);
446  m1A = _mm_or_si128(m1A, m1C);
447 
448  bool z2 = _mm_testz_si128(m1A, m1A);
449 
450  return z1 & z2;
451 }
452 
453 
454 /*!
455  @brief SUB (AND NOT) block digest stride
456  *dst &= ~*src
457 
458  @return true if stride is all zero
459  @ingroup SSE4
460 */
461 inline
462 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
463  const __m128i* BMRESTRICT src)
464 {
465  __m128i m1A, m1B, m1C, m1D;
466 
467  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
471 
472  _mm_store_si128(dst+0, m1A);
473  _mm_store_si128(dst+1, m1B);
474  _mm_store_si128(dst+2, m1C);
475  _mm_store_si128(dst+3, m1D);
476 
477  m1A = _mm_or_si128(m1A, m1B);
478  m1C = _mm_or_si128(m1C, m1D);
479  m1A = _mm_or_si128(m1A, m1C);
480 
481  bool z1 = _mm_testz_si128(m1A, m1A);
482 
483  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
487 
488  _mm_store_si128(dst+4, m1A);
489  _mm_store_si128(dst+5, m1B);
490  _mm_store_si128(dst+6, m1C);
491  _mm_store_si128(dst+7, m1D);
492 
493  m1A = _mm_or_si128(m1A, m1B);
494  m1C = _mm_or_si128(m1C, m1D);
495  m1A = _mm_or_si128(m1A, m1C);
496 
497  bool z2 = _mm_testz_si128(m1A, m1A);
498 
499  return z1 & z2;
500 }
501 
502 
503 /*!
504  @brief 2-operand SUB (AND NOT) block digest stride
505  *dst = *src1 & ~*src2
506 
507  @return true if stride is all zero
508  @ingroup SSE4
509 */
510 inline
511 bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
512  const __m128i* BMRESTRICT src1,
513  const __m128i* BMRESTRICT src2)
514 {
515  __m128i m1A, m1B, m1C, m1D;
516 
517  m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518  m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519  m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520  m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
521 
522  _mm_store_si128(dst+0, m1A);
523  _mm_store_si128(dst+1, m1B);
524  _mm_store_si128(dst+2, m1C);
525  _mm_store_si128(dst+3, m1D);
526 
527  m1A = _mm_or_si128(m1A, m1B);
528  m1C = _mm_or_si128(m1C, m1D);
529  m1A = _mm_or_si128(m1A, m1C);
530 
531  bool z1 = _mm_testz_si128(m1A, m1A);
532 
533  m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534  m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535  m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536  m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
537 
538  _mm_store_si128(dst+4, m1A);
539  _mm_store_si128(dst+5, m1B);
540  _mm_store_si128(dst+6, m1C);
541  _mm_store_si128(dst+7, m1D);
542 
543  m1A = _mm_or_si128(m1A, m1B);
544  m1C = _mm_or_si128(m1C, m1D);
545  m1A = _mm_or_si128(m1A, m1C);
546 
547  bool z2 = _mm_testz_si128(m1A, m1A);
548 
549  return z1 & z2;
550 }
551 
552 
553 
554 /*!
555  @brief check if block is all one bits
556  @ingroup SSE4
557 */
558 inline
559 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
560 {
561  __m128i w;
562  const __m128i* BMRESTRICT block_end =
563  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
564 
565  do
566  {
567  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568  if (!_mm_test_all_ones(w))
569  return false;
570  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571  if (!_mm_test_all_ones(w))
572  return false;
573 
574  block+=4;
575  } while (block < block_end);
576  return true;
577 }
578 
579 /*!
580  @brief check if wave of pointers is all NULL
581  @ingroup SSE4
582 */
583 BMFORCEINLINE
584 bool sse42_test_all_zero_wave(const void* ptr)
585 {
586  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
587  return _mm_testz_si128(w0, w0);
588 }
589 
590 /*!
591  @brief check if 2 waves of pointers are all NULL
592  @ingroup SSE4
593 */
594 BMFORCEINLINE
595 bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1)
596 {
597  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
598  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
599  w0 = _mm_or_si128(w0, w1);
600  return _mm_testz_si128(w0, w0);
601 }
602 
603 /*!
604  @brief check if wave of 2 pointers are the same (null or FULL)
605  @ingroup SSE4
606 */
607 BMFORCEINLINE
608 bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1)
609 {
610  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
611  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
612  w0 = _mm_xor_si128(w0, w1);
613  return _mm_testz_si128(w0, w0);
614 }
615 
616 
617 /*!
618  SSE4.2 calculate the number of bit transitions (0-1 changes) in a bit-block
619  @ingroup SSE4
620 */
621 inline
622 unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block)
623 {
624  const __m128i* block_end =
625  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
626  __m128i m1COshft, m2COshft;
627 
628  unsigned w0 = *((bm::word_t*)(block));
629  unsigned count = 1;
630 
631  unsigned co2, co1 = 0;
632  for (;block < block_end; block += 2)
633  {
634  __m128i m1A = _mm_load_si128(block);
635  __m128i m2A = _mm_load_si128(block+1);
636 
637  __m128i m1CO = _mm_srli_epi32(m1A, 31);
638  __m128i m2CO = _mm_srli_epi32(m2A, 31);
639 
640  co2 = _mm_extract_epi32(m1CO, 3);
641 
642  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
643  __m128i m2As = _mm_slli_epi32(m2A, 1);
644 
645  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
646  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
647 
648  co1 = co2;
649 
650  co2 = _mm_extract_epi32(m2CO, 3);
651 
652  m2COshft = _mm_slli_si128 (m2CO, 4);
653  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
654 
655  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
656  m2As = _mm_or_si128(m2As, m2COshft);
657 
658  co1 = co2;
659 
660  // we now have two shifted SSE4 regs with carry-over
661  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
662  m2A = _mm_xor_si128(m2A, m2As);
663 
664 #ifdef BM64_SSE4
665  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
666  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
667  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
668 
669  m0 = _mm_extract_epi64(m2A, 0);
670  m1 = _mm_extract_epi64(m2A, 1);
671  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
672 #else
673  bm::id_t m0 = _mm_extract_epi32(m1A, 0);
674  bm::id_t m1 = _mm_extract_epi32(m1A, 1);
675  bm::id_t m2 = _mm_extract_epi32(m1A, 2);
676  bm::id_t m3 = _mm_extract_epi32(m1A, 3);
677  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
678  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
679 
680  m0 = _mm_extract_epi32(m2A, 0);
681  m1 = _mm_extract_epi32(m2A, 1);
682  m2 = _mm_extract_epi32(m2A, 2);
683  m3 = _mm_extract_epi32(m2A, 3);
684  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
685  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
686 #endif
687 
688  }
689  count -= (w0 & 1u); // correct initial carry-in error
690  return count;
691 }
692 
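// Scalar reference (sketch, hypothetical helper): the same transition count without
// SIMD. Each word is XOR-ed with its own left-shifted value plus the carry bit from
// the previous word; the popcount of that XOR is the number of adjacent-bit changes.
/*
inline unsigned example_block_calc_change_scalar(const bm::word_t* BMRESTRICT block)
{
    unsigned count = 1;   // same initial value as the SIMD version above
    unsigned co = 0;      // carry-in: bit 31 of the previous word
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        bm::word_t w = block[i];
        count += unsigned(_mm_popcnt_u32(w ^ ((w << 1) | co)));
        co = w >> 31;
    }
    return count - (block[0] & 1u); // same carry-in correction as above
}
*/
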
693 
694 #ifdef __GNUG__
695 // necessary measure to silence false warning from GCC about negative pointer arithmetic
696 #pragma GCC diagnostic push
697 #pragma GCC diagnostic ignored "-Warray-bounds"
698 #endif
699 
700 /*!
701  SSE4.2 check of one to two (variable length) 128-bit SSE lanes for GAP search results (up to 8 elements each)
702  @ingroup SSE4
703  \internal
704 */
705 inline
706 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
707 {
708  BM_ASSERT(size <= 16);
709  BM_ASSERT(size);
710 
711  const unsigned unroll_factor = 8;
712  if (size < 4) // for very short vector use conventional scan
713  {
714  unsigned j;
715  for (j = 0; j < size; ++j)
716  {
717  if (pbuf[j] >= pos)
718  break;
719  }
720  return j;
721  }
722 
723  __m128i m1, mz, maskF, maskFL;
724 
725  mz = _mm_setzero_si128();
726  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
727 
728  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
729  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
730  int shiftL= (64 - (unroll_factor - size) * 16);
731  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
732 
733  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
734  m1 = _mm_or_si128(m1, maskFL);
735 
736  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
737  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
738  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
739  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
740  if (mi)
741  {
742  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
743  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
744  return unroll_factor - bc; // address of first one element (target)
745  }
746  // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
747  // GCC gives a false warning for "- unroll_factor" here
748  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
749  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
750 
751  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
752  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
753  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
754  unsigned bc = _mm_popcnt_u32(mi);
755 
756  return size - bc;
757 }
758 
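// Scalar equivalent of the SIMD search above (sketch, hypothetical helper): a plain
// lower-bound scan returning the index of the first element >= pos, or size if none.
/*
inline unsigned example_gap_find_scalar(const bm::gap_word_t* BMRESTRICT pbuf,
                                        bm::gap_word_t pos, unsigned size)
{
    unsigned j = 0;
    for (; j < size; ++j)
        if (pbuf[j] >= pos)
            break;
    return j;
}
*/
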
759 /**
760  Experimental (test) function to do SIMD vector search (lower bound)
761  in sorted, growing array
762  @ingroup SSE4
763 
764  \internal
765 */
766 inline
767 int sse42_cmpge_u32(__m128i vect4, unsigned value)
768 {
769  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
770  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
771  //
772  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
773  __m128i mm_val = _mm_set1_epi32(value);
774 
775  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
776  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
777 
778  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
779  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
780 
781  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
782  int mask = _mm_movemask_epi8(cmp_mask_ge);
783  if (mask)
784  {
785  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
786  return bsf / 4;
787  }
788  return -1;
789 }
790 
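// The bias trick referenced above, in scalar form (sketch, hypothetical helper):
// biasing both operands by 0x80000000 maps unsigned ordering onto signed ordering,
// so the signed SSE compare can stand in for an unsigned one.
/*
inline bool example_unsigned_gt_via_signed(unsigned a, unsigned b)
{
    int sa = int(a ^ 0x80000000u); // same value as (int)(a - 0x80000000)
    int sb = int(b ^ 0x80000000u);
    return sa > sb;                // equals (a > b) for the original unsigned values
}
*/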
791 
792 /**
793  lower bound (greater or equal) linear scan in an ascending-order sorted array
794  @ingroup SSE4
795  \internal
796 */
797 inline
798 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
799  unsigned target,
800  unsigned from,
801  unsigned to)
802 {
803  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
804  // see more at:
805  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
806 
807  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
808 
809  unsigned unroll_factor = 8;
810  unsigned len = to - from + 1;
811  unsigned len_unr = len - (len % unroll_factor);
812 
813  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
814  __m128i vect_target = _mm_set1_epi32(target);
815  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
816 
817  int mask;
818  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
819 
820  unsigned k = 0;
821  for (; k < len_unr; k+=unroll_factor)
822  {
823  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
824  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
825 
826  cmp_mask_ge = _mm_or_si128( // GT | EQ
827  _mm_cmpgt_epi32 (norm_vect40, norm_target),
828  _mm_cmpeq_epi32 (vect40, vect_target)
829  );
830  mask = _mm_movemask_epi8(cmp_mask_ge);
831  if (mask)
832  {
833  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
834  return from + k + (bsf / 4);
835  }
836  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
837  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
838 
839  cmp_mask_ge = _mm_or_si128(
840  _mm_cmpgt_epi32 (norm_vect41, norm_target),
841  _mm_cmpeq_epi32 (vect41, vect_target)
842  );
843  mask = _mm_movemask_epi8(cmp_mask_ge);
844  if (mask)
845  {
846  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
847  return 4 + from + k + (bsf / 4);
848  }
849  } // for
850 
851  for (; k < len; ++k)
852  {
853  if (arr_base[k] >= target)
854  return from + k;
855  }
856  return to + 1;
857 }
858 
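// Usage sketch (hypothetical helper): lower-bound search over a whole ascending u32
// array; the function scans the closed range [from..to] and returns to+1 when every
// element is smaller than the target.
/*
inline unsigned example_lower_bound(const unsigned* BMRESTRICT arr,
                                    unsigned size, unsigned target)
{
    BM_ASSERT(size);
    return bm::sse4_lower_bound_scan_u32(arr, target, 0, size - 1);
}
*/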
859 
860 
861 /*!
862  SSE4.2 index lookup to check which indexes belong to the same block (8 elements at a time)
863  \internal
864 */
865 inline
866 unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
867  unsigned nb, unsigned start)
868 {
869  const unsigned unroll_factor = 8;
870  const unsigned len = (size - start);
871  const unsigned len_unr = len - (len % unroll_factor);
872  unsigned k;
873 
874  idx += start;
875 
876  __m128i nbM = _mm_set1_epi32(nb);
877 
878  for (k = 0; k < len_unr; k+=unroll_factor)
879  {
880  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
881  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
882  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
883  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
884 
885  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
886  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
887  break;
888 
889  } // for k
890  for (; k < len; ++k)
891  {
892  if (nb != unsigned(idx[k] >> bm::set_block_shift))
893  break;
894  }
895  return start + k;
896 }
897 
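// Scalar equivalent (sketch, hypothetical helper): advance through the sorted index
// array while the indexes still map to block number nb; the SIMD version above
// checks 8 indexes per iteration.
/*
inline unsigned example_idx_arr_block_lookup_scalar(const unsigned* BMRESTRICT idx,
                                                    unsigned size, unsigned nb,
                                                    unsigned start)
{
    unsigned k = start;
    for (; k < size; ++k)
        if (nb != unsigned(idx[k] >> bm::set_block_shift))
            break;
    return k;
}
*/
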
898 /*!
899  SSE4.2 bulk bit set
900  \internal
901 */
902 inline
903 void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
904  const unsigned* BMRESTRICT idx,
905  unsigned start, unsigned stop )
906 {
907  const unsigned unroll_factor = 4;
908  const unsigned len = (stop - start);
909  const unsigned len_unr = len - (len % unroll_factor);
910 
911  idx += start;
912 
913  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
914  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
915 
916  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
917  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
918 
919  unsigned k = 0;
920  for (; k < len_unr; k+=unroll_factor)
921  {
922  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
923  __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
924  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
925 
926 
927  nbitA = _mm_and_si128 (nbitA, sw_mask);
928  _mm_store_si128 ((__m128i*)mshift_v, nbitA);
929 
930  // check-compare if all 4 bits are in the very same word
931  //
932  __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
933  __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
934  if (_mm_test_all_ones(cmpA)) // check if all are in one word
935  {
936  unsigned nword = _mm_extract_epi32(nwordA, 0);
937  block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
938  |(1u << mshift_v[2]) | (1u << mshift_v[3]);
939  }
940  else // bits are in different words, use scalar scatter
941  {
942  _mm_store_si128 ((__m128i*)mword_v, nwordA);
943 
944  block[mword_v[0]] |= (1u << mshift_v[0]);
945  block[mword_v[1]] |= (1u << mshift_v[1]);
946  block[mword_v[2]] |= (1u << mshift_v[2]);
947  block[mword_v[3]] |= (1u << mshift_v[3]);
948  }
949 
950  } // for k
951 
952  for (; k < len; ++k)
953  {
954  unsigned n = idx[k];
955  unsigned nbit = unsigned(n & bm::set_block_mask);
956  unsigned nword = nbit >> bm::set_word_shift;
957  nbit &= bm::set_word_mask;
958  block[nword] |= (1u << nbit);
959  } // for k
960 }
961 
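// Scalar equivalent (sketch, hypothetical helper): what the bulk bit-set above does
// for each index, without the 4-way SIMD address calculation and same-word fast path.
/*
inline void example_set_block_bits_scalar(bm::word_t* BMRESTRICT block,
                                          const unsigned* BMRESTRICT idx,
                                          unsigned start, unsigned stop)
{
    for (unsigned k = start; k < stop; ++k)
    {
        unsigned nbit  = unsigned(idx[k] & bm::set_block_mask);
        unsigned nword = nbit >> bm::set_word_shift;
        block[nword] |= (1u << (nbit & bm::set_word_mask));
    }
}
*/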
962 
963 /*!
964  SSE4.2 bit block gather-scatter
965 
966  @param arr - destination array to set bits
967  @param blk - source bit-block
968  @param idx - gather index array
969  @param size - gather array size
970  @param start - gather start index
971  @param bit_idx - bit to set in the target array
972 
973  \internal
974 
975  C algorithm:
976 
977  for (unsigned k = start; k < size; ++k)
978  {
979  nbit = unsigned(idx[k] & bm::set_block_mask);
980  nword = unsigned(nbit >> bm::set_word_shift);
981  mask0 = 1u << (nbit & bm::set_word_mask);
982  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
983  }
984 
985 */
986 inline
987 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
988  const unsigned* BMRESTRICT blk,
989  const unsigned* BMRESTRICT idx,
990  unsigned size,
991  unsigned start,
992  unsigned bit_idx)
993 {
994  const unsigned unroll_factor = 4;
995  const unsigned len = (size - start);
996  const unsigned len_unr = len - (len % unroll_factor);
997 
998  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
999  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1000  __m128i maskFF = _mm_set1_epi32(~0u);
1001  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1002 
1003  __m128i mask_tmp, mask_0;
1004 
1005  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1006  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1007 
1008  unsigned k = 0;
1009  unsigned base = start + k;
1010  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1011  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1012  for (; k < len_unr; k+=unroll_factor)
1013  {
1014  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1015  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1016  // (nbit & bm::set_word_mask)
1017  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1018  _mm_store_si128 ((__m128i*)mword_v, nwordA);
1019 
1020  // mask0 = 1u << (nbit & bm::set_word_mask);
1021  //
1022 #if 0
1023  // ifdefed an alternative SHIFT implementation using SSE and masks
1024  // (it is not faster than just doing scalar operations)
1025  {
1026  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1027  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1028  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1029  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1030  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1031 
1032  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1033  _mm_slli_si128 (am_0, 8));
1034  mask_tmp = _mm_and_si128 (
1035  _mm_slli_epi32(mask1, mshift_v[3]),
1036  _mm_slli_si128 (am_0, 12)
1037  );
1038 
1039  mask_0 = _mm_or_si128 (mask_0,
1040  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1041  }
1042 #endif
1043  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1044 
1045 
1046  // gather for: blk[nword] (.. & mask0 )
1047  //
1048  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1049  blk[mword_v[1]], blk[mword_v[0]]),
1050  mask_0);
1051 
1052  // bool(blk[nword] ...)
1053  //maskFF = _mm_set1_epi32(~0u);
1054  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1055  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1056  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1057 
1058  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1059 
1060  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1061  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1062 
1063  ++idx_ptr; ++target_ptr;
1064  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1065  }
1066 
1067  for (; k < len; ++k)
1068  {
1069  base = start + k;
1070  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1071  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1072  }
1073 
1074 }
1075 
1076 /*!
1077  @brief block shift right by 1
1078  @ingroup SSE4
1079 */
1080 inline
1081 bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
1082 {
1083  __m128i* block_end =
1084  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1085  __m128i m1COshft, m2COshft;
1086  __m128i mAcc = _mm_set1_epi32(0);
1087 
1088  unsigned co2;
1089 
1090  for (;block < block_end; block += 2)
1091  {
1092  __m128i m1A = _mm_load_si128(block);
1093  __m128i m2A = _mm_load_si128(block+1);
1094 
1095  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1096  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1097 
1098  co2 = _mm_extract_epi32(m1CO, 3);
1099 
1100  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1101  m2A = _mm_slli_epi32(m2A, 1);
1102 
1103  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1104  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1105 
1106  co1 = co2;
1107 
1108  co2 = _mm_extract_epi32(m2CO, 3);
1109 
1110  m2COshft = _mm_slli_si128 (m2CO, 4);
1111  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1112 
1113  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1114  m2A = _mm_or_si128(m2A, m2COshft);
1115 
1116  _mm_store_si128(block, m1A);
1117  _mm_store_si128(block+1, m2A);
1118 
1119  mAcc = _mm_or_si128(mAcc, m1A);
1120  mAcc = _mm_or_si128(mAcc, m2A);
1121 
1122  co1 = co2;
1123  }
1124  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1125  return co1;
1126 }
1127 
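// Scalar equivalent (sketch, hypothetical helper): shift the whole block up by one
// bit position, carrying bit 31 of each word into bit 0 of the next; *acc reports
// whether any set bits remain and the final carry-over is returned.
/*
inline bool example_shift_r1_scalar(bm::word_t* BMRESTRICT block,
                                    unsigned* BMRESTRICT acc, unsigned co)
{
    unsigned any = 0;
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        unsigned co_out = block[i] >> 31;
        any |= (block[i] = (block[i] << 1u) | co);
        co = co_out;
    }
    *acc = (any != 0);
    return co;
}
*/
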
1128 
1129 
1130 /*!
1131  @brief block shift right by 1 plus AND
1132 
1133  @return carry over flag
1134  @ingroup SSE4
1135 */
1136 inline
1137 bool sse42_shift_r1_and(__m128i* block,
1138  bm::word_t co1,
1139  const __m128i* BMRESTRICT mask_block,
1140  bm::id64_t* digest)
1141 {
1142  bm::word_t* wblock = (bm::word_t*) block;
1143  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1144 
1145  __m128i m1COshft, m2COshft;
1146  __m128i mAcc = _mm_set1_epi32(0);
1147  unsigned co2;
1148 
1149  bm::id64_t d, wd;
1150  wd = d = *digest;
1151 
1152  unsigned di = 0;
1153  if (!co1)
1154  {
1155  bm::id64_t t = d & -d;
1156 #ifdef BM64_SSE4
1157  di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1158 #else
1159  bm::id_t t32 = t & bm::id_max;
1160  if (t32 == 0) {
1161  di = 32;
1162  t32 = t >> 32;
1163  }
1164  di += unsigned(_mm_popcnt_u32(t32 - 1));
1165 #endif
1166  }
1167 
1168  for (; di < 64 ; ++di)
1169  {
1170  const unsigned d_base = di * bm::set_block_digest_wave_size;
1171  bm::id64_t dmask = (1ull << di);
1172  if (d & dmask) // digest stride NOT empty
1173  {
1174  block = (__m128i*) &wblock[d_base];
1175  mask_block = (__m128i*) &mblock[d_base];
1176  mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1177  for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1178  {
1179  __m128i m1A = _mm_load_si128(block);
1180  __m128i m2A = _mm_load_si128(block+1);
1181 
1182  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1183  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1184 
1185  co2 = _mm_extract_epi32(m1CO, 3);
1186 
1187  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1188  m2A = _mm_slli_epi32(m2A, 1);
1189 
1190  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1191  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1192 
1193  co1 = co2;
1194 
1195  co2 = _mm_extract_epi32(m2CO, 3);
1196 
1197  m2COshft = _mm_slli_si128 (m2CO, 4);
1198  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1199 
1200  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1201  m2A = _mm_or_si128(m2A, m2COshft);
1202 
1203  m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1204  m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1205 
1206  mAcc = _mm_or_si128(mAcc, m1A);
1207  mAcc = _mm_or_si128(mAcc, m2A);
1208 
1209  _mm_store_si128(block, m1A);
1210  _mm_store_si128(block+1, m2A);
1211 
1212  co1 = co2;
1213 
1214  } // for i
1215 
1216  if (_mm_testz_si128(mAcc, mAcc))
1217  d &= ~dmask; // clear digest bit
1218  wd &= wd - 1;
1219  }
1220  else
1221  {
1222  if (co1)
1223  {
1224  BM_ASSERT(co1 == 1);
1225  BM_ASSERT(wblock[d_base] == 0);
1226 
1227  bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1228  d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1229  co1 = 0;
1230  }
1231  if (!wd) // digest is empty, no CO -> exit
1232  break;
1233  }
1234  } // for di
1235 
1236  *digest = d;
1237  return co1;
1238 }
1239 
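// Digest indexing trick used above (sketch, assuming a 64-bit target / BM64_SSE4):
// d & -d isolates the lowest set bit of the 64-bit digest, and popcount(t - 1)
// converts that isolated bit into a 0-based stride index.
/*
inline unsigned example_first_digest_stride(bm::id64_t d)
{
    BM_ASSERT(d);
    bm::id64_t t = d & -d;                    // lowest set bit of d
    return unsigned(_mm_popcnt_u64(t - 1));   // its bit position == stride index
}
*/
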
1240 
1241 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1242  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1243 
1244 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1245  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1246 
1247 #define VECT_BITCOUNT(first, last) \
1248  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1249 
1250 #define VECT_BITCOUNT_AND(first, last, mask) \
1251  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1252 
1253 #define VECT_BITCOUNT_OR(first, last, mask) \
1254  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1255 
1256 #define VECT_BITCOUNT_XOR(first, last, mask) \
1257  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1258 
1259 #define VECT_BITCOUNT_SUB(first, last, mask) \
1260  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1261 
1262 #define VECT_INVERT_BLOCK(first) \
1263  sse2_invert_block((__m128i*)first);
1264 
1265 #define VECT_AND_BLOCK(dst, src) \
1266  sse4_and_block((__m128i*) dst, (__m128i*) (src))
1267 
1268 #define VECT_AND_DIGEST(dst, src) \
1269  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1270 
1271 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1272  sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1273 
1274 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1275  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1276 
1277 #define VECT_OR_BLOCK(dst, src) \
1278  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1279 
1280 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1281  sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1282 
1283 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1284  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1285 
1286 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1287  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1288 
1289 #define VECT_SUB_BLOCK(dst, src) \
1290  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1291 
1292 #define VECT_SUB_DIGEST(dst, src) \
1293  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1294 
1295 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1296  sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1297 
1298 #define VECT_XOR_BLOCK(dst, src) \
1299  sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1300 
1301 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1302  sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1303 
1304 #define VECT_COPY_BLOCK(dst, src) \
1305  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1306 
1307 #define VECT_STREAM_BLOCK(dst, src) \
1308  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1309 
1310 #define VECT_SET_BLOCK(dst, value) \
1311  sse2_set_block((__m128i*) dst, value)
1312 
1313 #define VECT_IS_ZERO_BLOCK(dst) \
1314  sse4_is_all_zero((__m128i*) dst)
1315 
1316 #define VECT_IS_ONE_BLOCK(dst) \
1317  sse4_is_all_one((__m128i*) dst)
1318 
1319 #define VECT_IS_DIGEST_ZERO(start) \
1320  sse4_is_digest_zero((__m128i*)start)
1321 
1322 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1323  sse4_block_set_digest((__m128i*)dst, val)
1324 
1325 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1326  sse4_lower_bound_scan_u32(arr, target, from, to)
1327 
1328 #define VECT_SHIFT_R1(b, acc, co) \
1329  sse42_shift_r1((__m128i*)b, acc, co)
1330 
1331 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1332  sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1333 
1334 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1335  sse42_idx_arr_block_lookup(idx, size, nb, start)
1336 
1337 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1338  sse42_set_block_bits(block, idx, start, stop)
1339 
1340 #define VECT_BLOCK_CHANGE(block) \
1341  sse42_bit_block_calc_change((__m128i*)block)
1342 
1343 
1344 #ifdef __GNUG__
1345 #pragma GCC diagnostic pop
1346 #endif
1347 
1348 #ifdef _MSC_VER
1349 #pragma warning( pop )
1350 #endif
1351 
1352 } // namespace
1353 
1354 
1355 
1356 
1357 #endif