BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 #ifdef _MSC_VER
50 #pragma warning( push )
51 #pragma warning( disable : 4146)
52 #endif
53 
54 
55 /*
56 inline
57 void sse2_print128(const char* prefix, const __m128i & value)
58 {
59  const size_t n = sizeof(__m128i) / sizeof(unsigned);
60  unsigned buffer[n];
61  _mm_storeu_si128((__m128i*)buffer, value);
62  std::cout << prefix << " [ ";
63  for (int i = n-1; 1; --i)
64  {
65  std::cout << buffer[i] << " ";
66  if (i == 0)
67  break;
68  }
69  std::cout << "]" << std::endl;
70 }
71 */
72 
73 /*!
74  SSE4.2 optimized bit counting.
75  @ingroup SSE4
76 */
77 inline
78 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
79 {
80  bm::id_t count = 0;
81 #ifdef BM64_SSE4
82  const bm::id64_t* b = (bm::id64_t*) block;
83  const bm::id64_t* b_end = (bm::id64_t*) block_end;
84  do
85  {
86  count += unsigned( _mm_popcnt_u64(b[0]) +
87  _mm_popcnt_u64(b[1]));
88  b += 2;
89  } while (b < b_end);
90 #else
91  do
92  {
93  const unsigned* b = (unsigned*) block;
94  count += _mm_popcnt_u32(b[0]) +
95  _mm_popcnt_u32(b[1]) +
96  _mm_popcnt_u32(b[2]) +
97  _mm_popcnt_u32(b[3]);
98  } while (++block < block_end);
99 #endif
100  return count;
101 }
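For reference, a minimal scalar sketch of what the routine above computes, assuming a plain 32-bit word view of the same [block, block_end) range; the helper name is illustrative and not part of bmsse4.h:

    #include <cstdint>

    // Scalar equivalent: count set bits in the half-open word range [first, last).
    inline unsigned scalar_bit_count(const std::uint32_t* first,
                                     const std::uint32_t* last)
    {
        unsigned count = 0;
        for (; first != last; ++first)
        {
            std::uint32_t w = *first;
            while (w)              // Kernighan's trick: clear the lowest set bit
            {
                w &= (w - 1);
                ++count;
            }
        }
        return count;
    }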
102 
103 /*!
104 \internal
105 */
106 BMFORCEINLINE
107 unsigned op_xor(unsigned a, unsigned b)
108 {
109  unsigned ret = (a ^ b);
110  return ret;
111 }
112 
113 /*!
114 \internal
115 */
116 BMFORCEINLINE
117 unsigned op_or(unsigned a, unsigned b)
118 {
119  return (a | b);
120 }
121 
122 /*!
123 \internal
124 */
125 BMFORCEINLINE
126 unsigned op_and(unsigned a, unsigned b)
127 {
128  return (a & b);
129 }
130 
131 
132 template<class Func>
133 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
134  const __m128i* BMRESTRICT block_end,
135  const __m128i* BMRESTRICT mask_block,
136  Func sse2_func)
137 {
138  bm::id_t count = 0;
139 #ifdef BM64_SSE4
140  do
141  {
142  __m128i tmp0 = _mm_load_si128(block);
143  __m128i tmp1 = _mm_load_si128(mask_block);
144  __m128i b = sse2_func(tmp0, tmp1);
145 
146  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
148 
149  ++block; ++mask_block;
150  } while (block < block_end);
151 #else
152  do
153  {
154  __m128i tmp0 = _mm_load_si128(block);
155  __m128i tmp1 = _mm_load_si128(mask_block);
156  __m128i b = sse2_func(tmp0, tmp1);
157 
158  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
162 
163  ++block; ++mask_block;
164  } while (block < block_end);
165 #endif
166 
167  return count;
168 }
169 
170 /*!
171  @brief check if block is all zero bits
172  @ingroup SSE4
173 */
174 inline
175 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
176 {
177  __m128i w;
178  __m128i maskz = _mm_setzero_si128();
179  const __m128i* BMRESTRICT block_end =
180  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
181 
182  do
183  {
184  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
186  return false;
187  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
189  return false;
190  block += 4;
191  } while (block < block_end);
192  return true;
193 }
194 
195 /*!
196  @brief check if digest stride is all zero bits
197  @ingroup SSE4
198 */
199 inline
200 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
201 {
202  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204  wA = _mm_or_si128(wA, wB);
205  bool z1 = _mm_test_all_zeros(wA, wA);
206 
207  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209  wA = _mm_or_si128(wA, wB);
210  bool z2 = _mm_test_all_zeros(wA, wA);
211  return z1 & z2;
212 }
213 
214 /*!
215  @brief set digest stride to 0xFF.. or 0x0 value
216  @ingroup SSE4
217 */
218 inline
219 void sse4_block_set_digest(__m128i* dst, unsigned value)
220 {
221  __m128i mV = _mm_set1_epi32(int(value));
222  _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223  _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224  _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225  _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
226 }
227 
228 
229 /*!
230  @brief AND blocks
231  *dst &= *src
232 
233  @return 0 if no bits were set
234  @ingroup SSE4
235 */
236 inline
237 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
238  const __m128i* BMRESTRICT src)
239 {
240  __m128i m1A, m1B, m1C, m1D;
241  __m128i accA, accB, accC, accD;
242 
243  const __m128i* BMRESTRICT src_end =
244  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
245 
246  accA = accB = accC = accD = _mm_setzero_si128();
247 
248  do
249  {
250  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
254 
255  _mm_store_si128(dst+0, m1A);
256  _mm_store_si128(dst+1, m1B);
257  _mm_store_si128(dst+2, m1C);
258  _mm_store_si128(dst+3, m1D);
259 
260  accA = _mm_or_si128(accA, m1A);
261  accB = _mm_or_si128(accB, m1B);
262  accC = _mm_or_si128(accC, m1C);
263  accD = _mm_or_si128(accD, m1D);
264 
265  src += 4; dst += 4;
266  } while (src < src_end);
267 
268  accA = _mm_or_si128(accA, accB); // A = A | B
269  accC = _mm_or_si128(accC, accD); // C = C | D
270  accA = _mm_or_si128(accA, accC); // A = A | C
271 
272  return !_mm_testz_si128(accA, accA);
273 }
274 
275 
276 /*!
277  @brief AND block digest stride
278  *dst &= *src
279 
280  @return true if stride is all zero
281  @ingroup SSE4
282 */
283 inline
284 bool sse4_and_digest(__m128i* BMRESTRICT dst,
285  const __m128i* BMRESTRICT src)
286 {
287  __m128i m1A, m1B, m1C, m1D;
288 
289  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
293 
294  _mm_store_si128(dst+0, m1A);
295  _mm_store_si128(dst+1, m1B);
296  _mm_store_si128(dst+2, m1C);
297  _mm_store_si128(dst+3, m1D);
298 
299  m1A = _mm_or_si128(m1A, m1B);
300  m1C = _mm_or_si128(m1C, m1D);
301  m1A = _mm_or_si128(m1A, m1C);
302 
303  bool z1 = _mm_testz_si128(m1A, m1A);
304 
305  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
309 
310  _mm_store_si128(dst+4, m1A);
311  _mm_store_si128(dst+5, m1B);
312  _mm_store_si128(dst+6, m1C);
313  _mm_store_si128(dst+7, m1D);
314 
315  m1A = _mm_or_si128(m1A, m1B);
316  m1C = _mm_or_si128(m1C, m1D);
317  m1A = _mm_or_si128(m1A, m1C);
318 
319  bool z2 = _mm_testz_si128(m1A, m1A);
320 
321  return z1 & z2;
322 }
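A scalar sketch of the same digest-stride step, assuming a stride of bm::set_block_digest_wave_size (32) 32-bit words, i.e. the eight 128-bit registers handled above; the helper name is illustrative:

    #include <cstdint>

    // Scalar sketch: AND one digest stride into dst and report whether
    // the stride became all zero.
    inline bool scalar_and_digest(std::uint32_t* dst, const std::uint32_t* src)
    {
        std::uint32_t acc = 0;
        for (unsigned i = 0; i < 32; ++i)   // 32 words == 8 x __m128i
        {
            dst[i] &= src[i];
            acc |= dst[i];
        }
        return acc == 0;                    // true -> stride is empty
    }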
323 
324 /*!
325  @brief AND block digest stride
326  *dst = *src1 & *src2
327 
328  @return true if stride is all zero
329  @ingroup SSE4
330 */
331 inline
332 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
333  const __m128i* BMRESTRICT src1,
334  const __m128i* BMRESTRICT src2)
335 {
336  __m128i m1A, m1B, m1C, m1D;
337 
338  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
342 
343  _mm_store_si128(dst+0, m1A);
344  _mm_store_si128(dst+1, m1B);
345  _mm_store_si128(dst+2, m1C);
346  _mm_store_si128(dst+3, m1D);
347 
348  m1A = _mm_or_si128(m1A, m1B);
349  m1C = _mm_or_si128(m1C, m1D);
350  m1A = _mm_or_si128(m1A, m1C);
351 
352  bool z1 = _mm_testz_si128(m1A, m1A);
353 
354  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
358 
359  _mm_store_si128(dst+4, m1A);
360  _mm_store_si128(dst+5, m1B);
361  _mm_store_si128(dst+6, m1C);
362  _mm_store_si128(dst+7, m1D);
363 
364  m1A = _mm_or_si128(m1A, m1B);
365  m1C = _mm_or_si128(m1C, m1D);
366  m1A = _mm_or_si128(m1A, m1C);
367 
368  bool z2 = _mm_testz_si128(m1A, m1A);
369 
370  return z1 & z2;
371 }
372 
373 /*!
374  @brief AND block digest stride
375  @return true if stride is all zero
376  @ingroup SSE4
377 */
378 inline
379 bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
380  const __m128i* BMRESTRICT src1,
381  const __m128i* BMRESTRICT src2,
382  const __m128i* BMRESTRICT src3,
383  const __m128i* BMRESTRICT src4)
384 {
385  __m128i m1A, m1B, m1C, m1D;
386  __m128i m1E, m1F, m1G, m1H;
387 
388  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
392 
393  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
397 
398  m1A = _mm_and_si128(m1A, m1E);
399  m1B = _mm_and_si128(m1B, m1F);
400  m1C = _mm_and_si128(m1C, m1G);
401  m1D = _mm_and_si128(m1D, m1H);
402 
403  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
407 
408  _mm_store_si128(dst+0, m1A);
409  _mm_store_si128(dst+1, m1B);
410  _mm_store_si128(dst+2, m1C);
411  _mm_store_si128(dst+3, m1D);
412 
413  m1A = _mm_or_si128(m1A, m1B);
414  m1C = _mm_or_si128(m1C, m1D);
415  m1A = _mm_or_si128(m1A, m1C);
416 
417  bool z1 = _mm_testz_si128(m1A, m1A);
418 
419  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
423 
424  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
428 
429  m1A = _mm_and_si128(m1A, m1E);
430  m1B = _mm_and_si128(m1B, m1F);
431  m1C = _mm_and_si128(m1C, m1G);
432  m1D = _mm_and_si128(m1D, m1H);
433 
434  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
438 
439  _mm_store_si128(dst+4, m1A);
440  _mm_store_si128(dst+5, m1B);
441  _mm_store_si128(dst+6, m1C);
442  _mm_store_si128(dst+7, m1D);
443 
444  m1A = _mm_or_si128(m1A, m1B);
445  m1C = _mm_or_si128(m1C, m1D);
446  m1A = _mm_or_si128(m1A, m1C);
447 
448  bool z2 = _mm_testz_si128(m1A, m1A);
449 
450  return z1 & z2;
451 }
452 
453 
454 /*!
455  @brief SUB (AND NOT) block digest stride
456  *dst &= ~*src
457 
458  @return true if stride is all zero
459  @ingroup SSE4
460 */
461 inline
462 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
463  const __m128i* BMRESTRICT src)
464 {
465  __m128i m1A, m1B, m1C, m1D;
466 
467  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
471 
472  _mm_store_si128(dst+0, m1A);
473  _mm_store_si128(dst+1, m1B);
474  _mm_store_si128(dst+2, m1C);
475  _mm_store_si128(dst+3, m1D);
476 
477  m1A = _mm_or_si128(m1A, m1B);
478  m1C = _mm_or_si128(m1C, m1D);
479  m1A = _mm_or_si128(m1A, m1C);
480 
481  bool z1 = _mm_testz_si128(m1A, m1A);
482 
483  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
487 
488  _mm_store_si128(dst+4, m1A);
489  _mm_store_si128(dst+5, m1B);
490  _mm_store_si128(dst+6, m1C);
491  _mm_store_si128(dst+7, m1D);
492 
493  m1A = _mm_or_si128(m1A, m1B);
494  m1C = _mm_or_si128(m1C, m1D);
495  m1A = _mm_or_si128(m1A, m1C);
496 
497  bool z2 = _mm_testz_si128(m1A, m1A);
498 
499  return z1 & z2;
500 }
501 
502 
503 /*!
504  @brief 2-operand SUB (AND NOT) block digest stride
505  *dst = *src1 & ~*src2
506 
507  @return true if stride is all zero
508  @ingroup SSE4
509 */
510 inline
511 bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
512  const __m128i* BMRESTRICT src1,
513  const __m128i* BMRESTRICT src2)
514 {
515  __m128i m1A, m1B, m1C, m1D;
516 
517  m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518  m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519  m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520  m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
521 
522  _mm_store_si128(dst+0, m1A);
523  _mm_store_si128(dst+1, m1B);
524  _mm_store_si128(dst+2, m1C);
525  _mm_store_si128(dst+3, m1D);
526 
527  m1A = _mm_or_si128(m1A, m1B);
528  m1C = _mm_or_si128(m1C, m1D);
529  m1A = _mm_or_si128(m1A, m1C);
530 
531  bool z1 = _mm_testz_si128(m1A, m1A);
532 
533  m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534  m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535  m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536  m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
537 
538  _mm_store_si128(dst+4, m1A);
539  _mm_store_si128(dst+5, m1B);
540  _mm_store_si128(dst+6, m1C);
541  _mm_store_si128(dst+7, m1D);
542 
543  m1A = _mm_or_si128(m1A, m1B);
544  m1C = _mm_or_si128(m1C, m1D);
545  m1A = _mm_or_si128(m1A, m1C);
546 
547  bool z2 = _mm_testz_si128(m1A, m1A);
548 
549  return z1 & z2;
550 }
551 
552 
553 
554 /*!
555  @brief check if block is all one bits
556  @ingroup SSE4
557 */
558 inline
559 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
560 {
561  __m128i w;
562  const __m128i* BMRESTRICT block_end =
563  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
564 
565  do
566  {
567  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568  if (!_mm_test_all_ones(w))
569  return false;
570  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571  if (!_mm_test_all_ones(w))
572  return false;
573 
574  block+=4;
575  } while (block < block_end);
576  return true;
577 }
578 
579 /*!
580  @brief check if SSE wave is all 0xFFFF...FFFF
581  @ingroup SSE4
582 */
583 BMFORCEINLINE
584 bool sse42_test_all_one_wave(const void* ptr)
585 {
586  return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
587 }
588 
589 
590 /*!
591  @brief check if wave of pointers is all NULL
592  @ingroup SSE4
593 */
594 BMFORCEINLINE
595 bool sse42_test_all_zero_wave(const void* ptr)
596 {
597  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
598  return _mm_testz_si128(w0, w0);
599 }
600 
601 /*!
602  @brief check if 2 waves of pointers are all NULL
603  @ingroup SSE4
604 */
605 BMFORCEINLINE
606 bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1)
607 {
608  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
609  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
610  w0 = _mm_or_si128(w0, w1);
611  return _mm_testz_si128(w0, w0);
612 }
613 
614 /*!
615  @brief check if 2 waves of pointers are the same (NULL or FULL)
616  @ingroup SSE4
617 */
618 BMFORCEINLINE
619 bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1)
620 {
621  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
622  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
623  w0 = _mm_xor_si128(w0, w1);
624  return _mm_testz_si128(w0, w0);
625 }
626 
627 
628 /*!
629  SSE4.2 calculation of the number of bit transitions (0 to 1)
630  @ingroup SSE4
631 */
632 inline
633 unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block,
634  unsigned size)
635 {
636  const __m128i* block_end =
637  ( __m128i*)((bm::word_t*)(block) + size); // bm::set_block_size
638  __m128i m1COshft, m2COshft;
639 
640  unsigned w0 = *((bm::word_t*)(block));
641  unsigned count = 1;
642 
643  unsigned co2, co1 = 0;
644  for (;block < block_end; block += 2)
645  {
646  __m128i m1A = _mm_load_si128(block);
647  __m128i m2A = _mm_load_si128(block+1);
648 
649  __m128i m1CO = _mm_srli_epi32(m1A, 31);
650  __m128i m2CO = _mm_srli_epi32(m2A, 31);
651 
652  co2 = _mm_extract_epi32(m1CO, 3);
653 
654  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
655  __m128i m2As = _mm_slli_epi32(m2A, 1);
656 
657  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
658  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
659 
660  co1 = co2;
661 
662  co2 = _mm_extract_epi32(m2CO, 3);
663 
664  m2COshft = _mm_slli_si128 (m2CO, 4);
665  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
666 
667  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
668  m2As = _mm_or_si128(m2As, m2COshft);
669 
670  co1 = co2;
671 
672  // we now have two shifted SSE4 regs with carry-over
673  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
674  m2A = _mm_xor_si128(m2A, m2As);
675 
676 #ifdef BM64_SSE4
677  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
678  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
679  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
680 
681  m0 = _mm_extract_epi64(m2A, 0);
682  m1 = _mm_extract_epi64(m2A, 1);
683  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
684 #else
685  bm::id_t m0 = _mm_extract_epi32(m1A, 0);
686  bm::id_t m1 = _mm_extract_epi32(m1A, 1);
687  bm::id_t m2 = _mm_extract_epi32(m1A, 2);
688  bm::id_t m3 = _mm_extract_epi32(m1A, 3);
689  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
690  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
691 
692  m0 = _mm_extract_epi32(m2A, 0);
693  m1 = _mm_extract_epi32(m2A, 1);
694  m2 = _mm_extract_epi32(m2A, 2);
695  m3 = _mm_extract_epi32(m2A, 3);
696  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
697  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
698 #endif
699 
700  }
701  count -= (w0 & 1u); // correct initial carry-in error
702  return count;
703 }
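The carry-over logic above is easier to see in scalar form; a sketch under the same conventions (count starts at 1, the first word's bit 0 is subtracted at the end), using the GCC/Clang __builtin_popcount for brevity:

    #include <cstdint>

    // Scalar sketch: count bit transitions inside a block of 'size' 32-bit words.
    inline unsigned scalar_block_calc_change(const std::uint32_t* block,
                                             unsigned size)
    {
        unsigned count = 1;
        std::uint32_t co = 0;                        // carry between words
        for (unsigned i = 0; i < size; ++i)
        {
            std::uint32_t w  = block[i];
            std::uint32_t ws = (w << 1) | co;        // shifted word with carry-in
            count += unsigned(__builtin_popcount(w ^ ws));
            co = w >> 31;                            // carry-out: MSB of this word
        }
        count -= (block[0] & 1u);                    // correct initial carry-in
        return count;
    }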
704 
705 
706 /*!
707  SSE4.2 calculation of the number of bit transitions (0 to 1) in the XOR product of two blocks
708  @ingroup SSE4
709 */
710 inline
711 unsigned sse42_bit_block_calc_xor_change(const __m128i* BMRESTRICT block,
712  const __m128i* BMRESTRICT xor_block,
713  unsigned size)
714 {
715  const __m128i* block_end =
716  ( __m128i*)((bm::word_t*)(block) + size);
717  __m128i m1COshft, m2COshft;
718 
719  unsigned w0 = *((bm::word_t*)(block));
720  unsigned count = 1;
721 
722  unsigned co2, co1 = 0;
723  for (;block < block_end; block += 2, xor_block += 2)
724  {
725  __m128i m1A = _mm_load_si128(block);
726  __m128i m2A = _mm_load_si128(block+1);
727  __m128i m1B = _mm_load_si128(xor_block);
728  __m128i m2B = _mm_load_si128(xor_block+1);
729 
730  m1A = _mm_xor_si128(m1A, m1B);
731  m2A = _mm_xor_si128(m2A, m2B);
732 
733  __m128i m1CO = _mm_srli_epi32(m1A, 31);
734  __m128i m2CO = _mm_srli_epi32(m2A, 31);
735 
736  co2 = _mm_extract_epi32(m1CO, 3);
737 
738  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
739  __m128i m2As = _mm_slli_epi32(m2A, 1);
740 
741  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
742  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
743 
744  co1 = co2;
745 
746  co2 = _mm_extract_epi32(m2CO, 3);
747 
748  m2COshft = _mm_slli_si128 (m2CO, 4);
749  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
750 
751  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
752  m2As = _mm_or_si128(m2As, m2COshft);
753 
754  co1 = co2;
755 
756  // we now have two shifted SSE4 regs with carry-over
757  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
758  m2A = _mm_xor_si128(m2A, m2As);
759 
760 #ifdef BM64_SSE4
761  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
762  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
763  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
764 
765  m0 = _mm_extract_epi64(m2A, 0);
766  m1 = _mm_extract_epi64(m2A, 1);
767  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
768 #else
769  bm::id_t m0 = _mm_extract_epi32(m1A, 0);
770  bm::id_t m1 = _mm_extract_epi32(m1A, 1);
771  bm::id_t m2 = _mm_extract_epi32(m1A, 2);
772  bm::id_t m3 = _mm_extract_epi32(m1A, 3);
773  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
774  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
775 
776  m0 = _mm_extract_epi32(m2A, 0);
777  m1 = _mm_extract_epi32(m2A, 1);
778  m2 = _mm_extract_epi32(m2A, 2);
779  m3 = _mm_extract_epi32(m2A, 3);
780  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
781  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
782 #endif
783 
784  }
785  count -= (w0 & 1u); // correct initial carry-in error
786  return count;
787 }
788 
789 
790 
791 #ifdef BM64_SSE4
792 
793 /*!
794  SSE4.2 calculation of the number of bit transitions (0 to 1) and the total bit count
795  @ingroup SSE4
796 */
797 inline
798 void sse42_bit_block_calc_change_bc(const __m128i* BMRESTRICT block,
799  unsigned* gc, unsigned* bc)
800 {
801  const __m128i* block_end =
802  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
803  __m128i m1COshft, m2COshft;
804 
805  unsigned w0 = *((bm::word_t*)(block));
806  unsigned bit_count = 0;
807  unsigned gap_count = 1;
808 
809  unsigned co2, co1 = 0;
810  for (;block < block_end; block += 2)
811  {
812  __m128i m1A = _mm_load_si128(block);
813  __m128i m2A = _mm_load_si128(block+1);
814  {
815  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
816  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
817  bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
818  m0 = _mm_extract_epi64(m2A, 0);
819  m1 = _mm_extract_epi64(m2A, 1);
820  bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
821  }
822 
823  __m128i m1CO = _mm_srli_epi32(m1A, 31);
824  __m128i m2CO = _mm_srli_epi32(m2A, 31);
825 
826  co2 = _mm_extract_epi32(m1CO, 3);
827 
828  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
829  __m128i m2As = _mm_slli_epi32(m2A, 1);
830 
831  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
832  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
833 
834  co1 = co2;
835 
836  co2 = _mm_extract_epi32(m2CO, 3);
837 
838  m2COshft = _mm_slli_si128 (m2CO, 4);
839  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
840 
841  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
842  m2As = _mm_or_si128(m2As, m2COshft);
843 
844  co1 = co2;
845 
846  // we now have two shifted SSE4 regs with carry-over
847  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
848  m2A = _mm_xor_si128(m2A, m2As);
849  {
850  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
851  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
852  gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
853  }
854 
855  bm::id64_t m0 = _mm_extract_epi64(m2A, 0);
856  bm::id64_t m1 = _mm_extract_epi64(m2A, 1);
857  gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
858 
859  }
860  gap_count -= (w0 & 1u); // correct initial carry-in error
861  *gc = gap_count;
862  *bc = bit_count;
863 }
864 
865 #endif
866 
867 
868 /*!
869  \brief Find first bit which is different between two bit-blocks
870  @ingroup SSE4
871 */
872 inline
873 bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1,
874  const __m128i* BMRESTRICT block2,
875  unsigned* pos)
876 {
877  unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
878 
879  const __m128i* block1_end =
880  (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
881  __m128i maskZ = _mm_setzero_si128();
882  __m128i mA, mB;
883  unsigned simd_lane = 0;
884  do
885  {
886  mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
887  mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
888  __m128i mOR = _mm_or_si128(mA, mB);
889  if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
890  {
891  if (!_mm_test_all_zeros(mA, mA))
892  {
893  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
894  mask = ~mask; // invert to find (w != 0)
895  BM_ASSERT(mask);
896  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
897  _mm_store_si128 ((__m128i*)simd_buf, mA);
898  unsigned widx = bsf >> 2; // (bsf / 4);
899  unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
900  bsf = bm::bsf_asm32(w); // find first bit != 0
901  *pos = (simd_lane * 128) + (widx * 32) + bsf;
902  return true;
903  }
904  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
905  mask = ~mask; // invert to find (w != 0)
906  BM_ASSERT(mask);
907  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
908  _mm_store_si128 ((__m128i*)simd_buf, mB);
909  unsigned widx = bsf >> 2; // (bsf / 4);
910  unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
911  bsf = bm::bsf_asm32(w); // find first bit != 0
912  *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
913  return true;
914  }
915 
916  simd_lane+=2;
917  block1+=2; block2+=2;
918 
919  } while (block1 < block1_end);
920  return false;
921 }
922 
923 
924 /*!
925  \brief Find first non-zero bit
926  @ingroup SSE4
927 */
928 inline
929 bool sse42_bit_find_first(const __m128i* BMRESTRICT block,
930  unsigned* pos)
931 {
932  unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
933 
934  const __m128i* block_end =
935  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
936  __m128i maskZ = _mm_setzero_si128();
937  __m128i mA, mB;
938  unsigned simd_lane = 0;
939  do
940  {
941  mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
942  __m128i mOR = _mm_or_si128(mA, mB);
943  if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
944  {
945  if (!_mm_test_all_zeros(mA, mA))
946  {
947  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
948  mask = ~mask; // invert to find (w != 0)
949  BM_ASSERT(mask);
950  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
951  _mm_store_si128 ((__m128i*)simd_buf, mA);
952  unsigned widx = bsf >> 2; // (bsf / 4);
953  unsigned w = simd_buf[widx];
954  bsf = bm::bsf_asm32(w); // find first bit != 0
955  *pos = (simd_lane * 128) + (widx * 32) + bsf;
956  return true;
957  }
958  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
959  mask = ~mask; // invert to find (w != 0)
960  BM_ASSERT(mask);
961  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
962  _mm_store_si128 ((__m128i*)simd_buf, mB);
963  unsigned widx = bsf >> 2; // (bsf / 4);
964  unsigned w = simd_buf[widx];
965  bsf = bm::bsf_asm32(w); // find first bit != 0
966  *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
967  return true;
968  }
969 
970  simd_lane+=2;
971  block+=2;
972 
973  } while (block < block_end);
974  return false;
975 }
976 
977 
978 
979 
980 #ifdef __GNUG__
981 // necessary measure to silence false warning from GCC about negative pointer arithmetics
982 #pragma GCC diagnostic push
983 #pragma GCC diagnostic ignored "-Warray-bounds"
984 #endif
985 
986 /*!
987  SSE4.2 scan of one or two (variable length) 128-bit SSE lanes
988  for GAP search results (8 elements each)
989  @ingroup SSE4
990  \internal
991 */
992 inline
993 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
994  const bm::gap_word_t pos, const unsigned size)
995 {
996  BM_ASSERT(size <= 16);
997  BM_ASSERT(size);
998 
999  const unsigned unroll_factor = 8;
1000  if (size < 4) // for very short vector use conventional scan
1001  {
1002  unsigned j;
1003  for (j = 0; j < size; ++j)
1004  {
1005  if (pbuf[j] >= pos)
1006  break;
1007  }
1008  return j;
1009  }
1010 
1011  __m128i m1, mz, maskF, maskFL;
1012 
1013  mz = _mm_setzero_si128();
1014  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1015 
1016  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
1017  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1018  int shiftL= (64 - (unroll_factor - size) * 16);
1019  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1020 
1021  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1022  m1 = _mm_or_si128(m1, maskFL);
1023 
1024  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1025  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1026  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
1027  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
1028  if (mi)
1029  {
1030  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
1031  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
1032  return unroll_factor - bc; // address of first one element (target)
1033  }
1034  // inspect the next lane with possible step back (to avoid over-read the block boundaries)
1035  // GCC gives a false warning for "- unroll_factor" here
1036  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1037  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
1038 
1039  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1040  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1041  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
1042  unsigned bc = _mm_popcnt_u32(mi);
1043 
1044  return size - bc;
1045 }
1046 
1047 /**
1048  Hybrid binary search, starts as binary, then switches to linear scan
1049 
1050  \param buf - GAP buffer pointer.
1051  \param pos - index of the element.
1052  \param is_set - output. GAP value (0 or 1).
1053  \return GAP index.
1054 
1055  @ingroup SSE4
1056 */
1057 inline
1058 unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf,
1059  unsigned pos, unsigned* BMRESTRICT is_set)
1060 {
1061  unsigned start = 1;
1062  unsigned end = 1 + ((*buf) >> 3);
1063  unsigned dsize = end - start;
1064 
1065  if (dsize < 17)
1066  {
1067  start = bm::sse4_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
1068  *is_set = ((*buf) & 1) ^ (start & 1);
1069  BM_ASSERT(buf[start+1] >= pos);
1070  BM_ASSERT(buf[start] < pos || (start==0));
1071 
1072  return start+1;
1073  }
1074  unsigned arr_end = end;
1075  while (start != end)
1076  {
1077  unsigned curr = (start + end) >> 1;
1078  if (buf[curr] < pos)
1079  start = curr + 1;
1080  else
1081  end = curr;
1082 
1083  unsigned size = end - start;
1084  if (size < 16)
1085  {
1086  size += (end != arr_end);
1087  unsigned idx =
1088  bm::sse4_gap_find(buf + start, (bm::gap_word_t)pos, size);
1089  start += idx;
1090 
1091  BM_ASSERT(buf[start] >= pos);
1092  BM_ASSERT(buf[start - 1] < pos || (start == 1));
1093  break;
1094  }
1095  }
1096 
1097  *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1098  return start;
1099 }
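A short usage sketch; gap_buf is assumed to be a valid GAP-encoded block and the helper name is hypothetical:

    // Report the value of bit 'pos' inside a GAP-encoded block.
    inline bool gap_bit_value(const unsigned short* gap_buf, unsigned pos)
    {
        unsigned is_set;
        unsigned gap_idx = bm::sse42_gap_bfind(gap_buf, pos, &is_set);
        (void) gap_idx;       // index of the GAP run that contains 'pos'
        return is_set != 0;   // same answer as bm::sse42_gap_test(gap_buf, pos)
    }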
1100 
1101 /**
1102  Hybrid binary search, starts as binary, then switches to scan
1103  @ingroup SSE4
1104 */
1105 inline
1106 unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
1107 {
1108  unsigned is_set;
1109  bm::sse42_gap_bfind(buf, pos, &is_set);
1110  return is_set;
1111 }
1112 
1113 
1114 
1115 /**
1116  Experimental (test) function to do SIMD vector search (lower bound)
1117  in sorted, growing array
1118  @ingroup SSE4
1119 
1120  \internal
1121 */
1122 inline
1123 int sse42_cmpge_u32(__m128i vect4, unsigned value)
1124 {
1125  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1126  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1127  //
1128  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1129  __m128i mm_val = _mm_set1_epi32(value);
1130 
1131  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
1132  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
1133 
1134  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
1135  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
1136 
1137  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
1138  int mask = _mm_movemask_epi8(cmp_mask_ge);
1139  if (mask)
1140  {
1141  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
1142  return bsf / 4;
1143  }
1144  return -1;
1145 }
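The bias trick referenced in the comment above reduces to a one-liner in scalar code; a sketch (the helper is illustrative):

    #include <cstdint>

    // Unsigned a >= b evaluated with a signed comparison: flipping the top bit
    // (equivalent to subtracting 0x80000000) maps unsigned order to signed order.
    inline bool unsigned_ge_via_signed(std::uint32_t a, std::uint32_t b)
    {
        std::int32_t a_biased = std::int32_t(a ^ 0x80000000u);
        std::int32_t b_biased = std::int32_t(b ^ 0x80000000u);
        return a_biased >= b_biased;        // same result as (a >= b)
    }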
1146 
1147 
1148 /**
1149  lower bound (greater or equal) linear scan in an ascending-order sorted array
1150  @ingroup SSE4
1151  \internal
1152 */
1153 inline
1154 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
1155  unsigned target,
1156  unsigned from,
1157  unsigned to)
1158 {
1159  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1160  // see more at:
1161  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1162 
1163  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
1164 
1165  unsigned unroll_factor = 8;
1166  unsigned len = to - from + 1;
1167  unsigned len_unr = len - (len % unroll_factor);
1168 
1169  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1170  __m128i vect_target = _mm_set1_epi32(target);
1171  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
1172 
1173  int mask;
1174  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
1175 
1176  unsigned k = 0;
1177  for (; k < len_unr; k+=unroll_factor)
1178  {
1179  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
1180  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
1181 
1182  cmp_mask_ge = _mm_or_si128( // GT | EQ
1183  _mm_cmpgt_epi32 (norm_vect40, norm_target),
1184  _mm_cmpeq_epi32 (vect40, vect_target)
1185  );
1186  mask = _mm_movemask_epi8(cmp_mask_ge);
1187  if (mask)
1188  {
1189  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
1190  return from + k + (bsf / 4);
1191  }
1192  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
1193  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
1194 
1195  cmp_mask_ge = _mm_or_si128(
1196  _mm_cmpgt_epi32 (norm_vect41, norm_target),
1197  _mm_cmpeq_epi32 (vect41, vect_target)
1198  );
1199  mask = _mm_movemask_epi8(cmp_mask_ge);
1200  if (mask)
1201  {
1202  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
1203  return 4 + from + k + (bsf / 4);
1204  }
1205  } // for
1206 
1207  for (; k < len; ++k)
1208  {
1209  if (arr_base[k] >= target)
1210  return from + k;
1211  }
1212  return to + 1;
1213 }
1214 
1215 
1216 
1217 /*!
1218  SSE4.2 index lookup to check what belongs to the same block (8 elements)
1219  \internal
1220 */
1221 inline
1222 unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
1223  unsigned nb, unsigned start)
1224 {
1225  const unsigned unroll_factor = 8;
1226  const unsigned len = (size - start);
1227  const unsigned len_unr = len - (len % unroll_factor);
1228  unsigned k;
1229 
1230  idx += start;
1231 
1232  __m128i nbM = _mm_set1_epi32(nb);
1233 
1234  for (k = 0; k < len_unr; k+=unroll_factor)
1235  {
1236  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1237  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
1238  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
1239  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
1240 
1241  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
1242  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
1243  break;
1244 
1245  } // for k
1246  for (; k < len; ++k)
1247  {
1248  if (nb != unsigned(idx[k] >> bm::set_block_shift))
1249  break;
1250  }
1251  return start + k;
1252 }
1253 
1254 /*!
1255  SSE4.2 bulk bit set
1256  \internal
1257 */
1258 inline
1259 void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
1260  const unsigned* BMRESTRICT idx,
1261  unsigned start, unsigned stop )
1262 {
1263  const unsigned unroll_factor = 4;
1264  const unsigned len = (stop - start);
1265  const unsigned len_unr = len - (len % unroll_factor);
1266 
1267  idx += start;
1268 
1269  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1270  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1271 
1272  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1273  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1274 
1275  unsigned k = 0;
1276  for (; k < len_unr; k+=unroll_factor)
1277  {
1278  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1279  __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
1280  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1281 
1282 
1283  nbitA = _mm_and_si128 (nbitA, sw_mask);
1284  _mm_store_si128 ((__m128i*)mshift_v, nbitA);
1285 
1286  // check-compare if all 4 bits are in the very same word
1287  //
1288  __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
1289  __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
1290  if (_mm_test_all_ones(cmpA)) // check if all are in one word
1291  {
1292  unsigned nword = _mm_extract_epi32(nwordA, 0);
1293  block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
1294  |(1u << mshift_v[2]) | (1u << mshift_v[3]);
1295  }
1296  else // bits are in different words, use scalar scatter
1297  {
1298  _mm_store_si128 ((__m128i*)mword_v, nwordA);
1299 
1300  block[mword_v[0]] |= (1u << mshift_v[0]);
1301  block[mword_v[1]] |= (1u << mshift_v[1]);
1302  block[mword_v[2]] |= (1u << mshift_v[2]);
1303  block[mword_v[3]] |= (1u << mshift_v[3]);
1304  }
1305 
1306  } // for k
1307 
1308  for (; k < len; ++k)
1309  {
1310  unsigned n = idx[k];
1311  unsigned nbit = unsigned(n & bm::set_block_mask);
1312  unsigned nword = nbit >> bm::set_word_shift;
1313  nbit &= bm::set_word_mask;
1314  block[nword] |= (1u << nbit);
1315  } // for k
1316 }
1317 
1318 
1319 /*!
1320  SSE4.2 bit block gather-scatter
1321 
1322  @param arr - destination array to set bits
1323  @param blk - source bit-block
1324  @param idx - gather index array
1325  @param size - gather array size
1326  @param start - gather start index
1327  @param bit_idx - bit to set in the target array
1328 
1329  \internal
1330 
1331  C algorithm:
1332 
1333  for (unsigned k = start; k < size; ++k)
1334  {
1335  nbit = unsigned(idx[k] & bm::set_block_mask);
1336  nword = unsigned(nbit >> bm::set_word_shift);
1337  mask0 = 1u << (nbit & bm::set_word_mask);
1338  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
1339  }
1340 
1341 */
1342 inline
1343 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
1344  const unsigned* BMRESTRICT blk,
1345  const unsigned* BMRESTRICT idx,
1346  unsigned size,
1347  unsigned start,
1348  unsigned bit_idx)
1349 {
1350  const unsigned unroll_factor = 4;
1351  const unsigned len = (size - start);
1352  const unsigned len_unr = len - (len % unroll_factor);
1353 
1354  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1355  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1356  __m128i maskFF = _mm_set1_epi32(~0u);
1357  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1358 
1359  __m128i mask_tmp, mask_0;
1360 
1361  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1362  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1363 
1364  unsigned k = 0;
1365  unsigned base = start + k;
1366  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1367  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1368  for (; k < len_unr; k+=unroll_factor)
1369  {
1370  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1371  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1372  // (nbit & bm::set_word_mask)
1373  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1374  _mm_store_si128 ((__m128i*)mword_v, nwordA);
1375 
1376  // mask0 = 1u << (nbit & bm::set_word_mask);
1377  //
1378 #if 0
1379  // ifdefed an alternative SHIFT implementation using SSE and masks
1380  // (it is not faster than just doing scalar operations)
1381  {
1382  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1383  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1384  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1385  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1386  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1387 
1388  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1389  _mm_slli_si128 (am_0, 8));
1390  mask_tmp = _mm_and_si128 (
1391  _mm_slli_epi32(mask1, mshift_v[3]),
1392  _mm_slli_si128 (am_0, 12)
1393  );
1394 
1395  mask_0 = _mm_or_si128 (mask_0,
1396  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1397  }
1398 #endif
1399  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1400 
1401 
1402  // gather for: blk[nword] (.. & mask0 )
1403  //
1404  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1405  blk[mword_v[1]], blk[mword_v[0]]),
1406  mask_0);
1407 
1408  // bool(blk[nword] ...)
1409  //maskFF = _mm_set1_epi32(~0u);
1410  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1411  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1412  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1413 
1414  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1415 
1416  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1417  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1418 
1419  ++idx_ptr; ++target_ptr;
1420  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1421  }
1422 
1423  for (; k < len; ++k)
1424  {
1425  base = start + k;
1426  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1427  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1428  }
1429 
1430 }
1431 
1432 /*!
1433  @brief block shift left by 1
1434  @ingroup SSE4
1435 */
1436 inline
1437 bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1)
1438 {
1439  __m128i* block_end =
1440  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1441  __m128i mAcc = _mm_set1_epi32(0);
1442  __m128i mMask1 = _mm_set1_epi32(1);
1443 
1444  unsigned co2;
1445  for (--block_end; block_end >= block; block_end -= 2)
1446  {
1447  __m128i m1A = _mm_load_si128(block_end);
1448  __m128i m2A = _mm_load_si128(block_end-1);
1449 
1450  __m128i m1CO = _mm_and_si128(m1A, mMask1);
1451  __m128i m2CO = _mm_and_si128(m2A, mMask1);
1452 
1453  co2 = _mm_extract_epi32(m1CO, 0);
1454 
1455  m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1456  m2A = _mm_srli_epi32(m2A, 1);
1457 
1458  __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1459  __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1460  m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1461  m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1462  m1COshft = _mm_slli_epi32(m1COshft, 31);
1463  m2COshft = _mm_slli_epi32(m2COshft, 31);
1464 
1465  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1466  m2A = _mm_or_si128(m2A, m2COshft);
1467 
1468  co1 = _mm_extract_epi32(m2CO, 0);
1469 
1470  _mm_store_si128(block_end, m1A);
1471  _mm_store_si128(block_end-1, m2A);
1472 
1473  mAcc = _mm_or_si128(mAcc, m1A);
1474  mAcc = _mm_or_si128(mAcc, m2A);
1475  } // for
1476 
1477  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1478  return co1;
1479 }
1480 
1481 
1482 /*!
1483  @brief block shift right by 1
1484  @ingroup SSE4
1485 */
1486 inline
1487 bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
1488 {
1489  __m128i* block_end =
1490  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1491  __m128i m1COshft, m2COshft;
1492  __m128i mAcc = _mm_set1_epi32(0);
1493 
1494  unsigned co2;
1495  for (;block < block_end; block += 2)
1496  {
1497  __m128i m1A = _mm_load_si128(block);
1498  __m128i m2A = _mm_load_si128(block+1);
1499 
1500  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1501  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1502 
1503  co2 = _mm_extract_epi32(m1CO, 3);
1504 
1505  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1506  m2A = _mm_slli_epi32(m2A, 1);
1507 
1508  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1509  m2COshft = _mm_slli_si128 (m2CO, 4);
1510  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1511  m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1512 
1513  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1514  m2A = _mm_or_si128(m2A, m2COshft);
1515 
1516  co1 = _mm_extract_epi32(m2CO, 3);
1517 
1518  _mm_store_si128(block, m1A);
1519  _mm_store_si128(block+1, m2A);
1520 
1521  mAcc = _mm_or_si128(mAcc, m1A);
1522  mAcc = _mm_or_si128(mAcc, m2A);
1523  }
1524  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1525  return co1;
1526 }
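A scalar sketch of the same carry-propagating shift, assuming a block of 'size' 32-bit words; names are illustrative:

    #include <cstdint>

    // Shift the whole block one bit position "right" in bitvector order:
    // every word is shifted left by 1 and the MSB carries into the next word.
    inline bool scalar_shift_r1(std::uint32_t* block, unsigned size,
                                unsigned* empty_acc, std::uint32_t co)
    {
        std::uint32_t acc = 0;
        for (unsigned i = 0; i < size; ++i)
        {
            std::uint32_t w = block[i];
            block[i] = (w << 1) | co;   // bring in carry from the previous word
            co = w >> 31;               // carry-out toward the next word
            acc |= block[i];
        }
        *empty_acc = (acc != 0);        // non-zero if any bits remain set
        return co;                      // carry-out of the whole block
    }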
1527 
1528 
1529 
1530 /*!
1531  @brief block shift right by 1 plus AND
1532 
1533  @return carry over flag
1534  @ingroup SSE4
1535 */
1536 inline
1537 bool sse42_shift_r1_and(__m128i* block,
1538  bm::word_t co1,
1539  const __m128i* BMRESTRICT mask_block,
1540  bm::id64_t* digest)
1541 {
1542  bm::word_t* wblock = (bm::word_t*) block;
1543  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1544 
1545  __m128i m1COshft, m2COshft;
1546  __m128i mAcc = _mm_set1_epi32(0);
1547  unsigned co2;
1548 
1549  bm::id64_t d, wd;
1550  wd = d = *digest;
1551 
1552  unsigned di = 0;
1553  if (!co1)
1554  {
1555  bm::id64_t t = d & -d;
1556 #ifdef BM64_SSE4
1557  di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1558 #else
1559  bm::id_t t32 = t & bm::id_max;
1560  if (t32 == 0) {
1561  di = 32;
1562  t32 = t >> 32;
1563  }
1564  di += unsigned(_mm_popcnt_u32(t32 - 1));
1565 #endif
1566  }
1567 
1568  for (; di < 64 ; ++di)
1569  {
1570  const unsigned d_base = di * bm::set_block_digest_wave_size;
1571  bm::id64_t dmask = (1ull << di);
1572  if (d & dmask) // digest stride NOT empty
1573  {
1574  block = (__m128i*) &wblock[d_base];
1575  mask_block = (__m128i*) &mblock[d_base];
1576  mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1577  for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1578  {
1579  __m128i m1A = _mm_load_si128(block);
1580  __m128i m2A = _mm_load_si128(block+1);
1581 
1582  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1583  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1584 
1585  co2 = _mm_extract_epi32(m1CO, 3);
1586 
1587  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1588  m2A = _mm_slli_epi32(m2A, 1);
1589 
1590  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1591  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1592 
1593  co1 = co2;
1594 
1595  co2 = _mm_extract_epi32(m2CO, 3);
1596 
1597  m2COshft = _mm_slli_si128 (m2CO, 4);
1598  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1599 
1600  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1601  m2A = _mm_or_si128(m2A, m2COshft);
1602 
1603  m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1604  m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1605 
1606  mAcc = _mm_or_si128(mAcc, m1A);
1607  mAcc = _mm_or_si128(mAcc, m2A);
1608 
1609  _mm_store_si128(block, m1A);
1610  _mm_store_si128(block+1, m2A);
1611 
1612  co1 = co2;
1613 
1614  } // for i
1615 
1616  if (_mm_testz_si128(mAcc, mAcc))
1617  d &= ~dmask; // clear digest bit
1618  wd &= wd - 1;
1619  }
1620  else
1621  {
1622  if (co1)
1623  {
1624  BM_ASSERT(co1 == 1);
1625  BM_ASSERT(wblock[d_base] == 0);
1626 
1627  bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1628  d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1629  co1 = 0;
1630  }
1631  if (!wd) // digest is empty, no CO -> exit
1632  break;
1633  }
1634  } // for di
1635 
1636  *digest = d;
1637  return co1;
1638 }
1639 
1640 /**
1641  Build partial XOR product of 2 bit-blocks using digest mask
1642 
1643  @param target_block - target := block ^ xor_block
1644  @param block - arg1
1645  @param xor_block - arg2
1646  @param digest - mask for each block wave to XOR (1) or just copy (0)
1647 
1648  @ingroup SSE4
1649  @internal
1650 */
1651 inline
1652 void sse42_bit_block_xor(bm::word_t* target_block,
1653  const bm::word_t* block, const bm::word_t* xor_block,
1654  bm::id64_t digest)
1655 {
1656  for (unsigned i = 0; i < bm::block_waves; ++i)
1657  {
1658  const bm::id64_t mask = (1ull << i);
1659  unsigned off = (i * bm::set_block_digest_wave_size);
1660  const __m128i* sub_block = (__m128i*) (block + off);
1661  __m128i* t_sub_block = (__m128i*)(target_block + off);
1662 
1663  if (digest & mask) // XOR filtered sub-block
1664  {
1665  const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
1666  __m128i mA, mB, mC, mD;
1667  mA = _mm_xor_si128(_mm_load_si128(sub_block),
1668  _mm_load_si128(xor_sub_block));
1669  mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
1670  _mm_load_si128(xor_sub_block+1));
1671  mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
1672  _mm_load_si128(xor_sub_block+2));
1673  mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
1674  _mm_load_si128(xor_sub_block+3));
1675 
1676  _mm_store_si128(t_sub_block, mA);
1677  _mm_store_si128(t_sub_block+1, mB);
1678  _mm_store_si128(t_sub_block+2, mC);
1679  _mm_store_si128(t_sub_block+3, mD);
1680 
1681  mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
1682  _mm_load_si128(xor_sub_block+4));
1683  mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
1684  _mm_load_si128(xor_sub_block+5));
1685  mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
1686  _mm_load_si128(xor_sub_block+6));
1687  mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
1688  _mm_load_si128(xor_sub_block+7));
1689 
1690  _mm_store_si128(t_sub_block+4, mA);
1691  _mm_store_si128(t_sub_block+5, mB);
1692  _mm_store_si128(t_sub_block+6, mC);
1693  _mm_store_si128(t_sub_block+7, mD);
1694 
1695  }
1696  else // just copy source
1697  {
1698  _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
1699  _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
1700  _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
1701  _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
1702 
1703  _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
1704  _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
1705  _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
1706  _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
1707  }
1708  } // for i
1709 }
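A scalar sketch of the same digest-driven filter, assuming bm::block_waves == 64 waves of bm::set_block_digest_wave_size == 32 words; the helper name is illustrative:

    #include <cstdint>

    // XOR the waves selected by 'digest', copy the rest unchanged.
    inline void scalar_bit_block_xor(std::uint32_t* target,
                                     const std::uint32_t* block,
                                     const std::uint32_t* xor_block,
                                     std::uint64_t digest)
    {
        for (unsigned i = 0; i < 64; ++i)
        {
            const unsigned off = i * 32;
            if (digest & (1ull << i))                 // wave is XOR-filtered
                for (unsigned j = 0; j < 32; ++j)
                    target[off + j] = block[off + j] ^ xor_block[off + j];
            else                                      // wave is copied as-is
                for (unsigned j = 0; j < 32; ++j)
                    target[off + j] = block[off + j];
        }
    }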
1710 
1711 
1712 
1713 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1714  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1715 
1716 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1717  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1718 
1719 #define VECT_BITCOUNT(first, last) \
1720  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1721 
1722 #define VECT_BITCOUNT_AND(first, last, mask) \
1723  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1724 
1725 #define VECT_BITCOUNT_OR(first, last, mask) \
1726  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1727 
1728 #define VECT_BITCOUNT_XOR(first, last, mask) \
1729  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1730 
1731 #define VECT_BITCOUNT_SUB(first, last, mask) \
1732  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1733 
1734 #define VECT_INVERT_BLOCK(first) \
1735  sse2_invert_block((__m128i*)first);
1736 
1737 #define VECT_AND_BLOCK(dst, src) \
1738  sse4_and_block((__m128i*) dst, (__m128i*) (src))
1739 
1740 #define VECT_AND_DIGEST(dst, src) \
1741  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1742 
1743 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1744  sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1745 
1746 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1747  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1748 
1749 #define VECT_OR_BLOCK(dst, src) \
1750  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1751 
1752 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1753  sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1754 
1755 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1756  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1757 
1758 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1759  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1760 
1761 #define VECT_SUB_BLOCK(dst, src) \
1762  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1763 
1764 #define VECT_SUB_DIGEST(dst, src) \
1765  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1766 
1767 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1768  sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1769 
1770 #define VECT_XOR_BLOCK(dst, src) \
1771  sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1772 
1773 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1774  sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1775 
1776 #define VECT_COPY_BLOCK(dst, src) \
1777  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1778 
1779 #define VECT_STREAM_BLOCK(dst, src) \
1780  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1781 
1782 #define VECT_SET_BLOCK(dst, value) \
1783  sse2_set_block((__m128i*) dst, value)
1784 
1785 #define VECT_IS_ZERO_BLOCK(dst) \
1786  sse4_is_all_zero((__m128i*) dst)
1787 
1788 #define VECT_IS_ONE_BLOCK(dst) \
1789  sse4_is_all_one((__m128i*) dst)
1790 
1791 #define VECT_IS_DIGEST_ZERO(start) \
1792  sse4_is_digest_zero((__m128i*)start)
1793 
1794 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1795  sse4_block_set_digest((__m128i*)dst, val)
1796 
1797 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1798  sse4_lower_bound_scan_u32(arr, target, from, to)
1799 
1800 #define VECT_SHIFT_L1(b, acc, co) \
1801  sse42_shift_l1((__m128i*)b, acc, co)
1802 
1803 #define VECT_SHIFT_R1(b, acc, co) \
1804  sse42_shift_r1((__m128i*)b, acc, co)
1805 
1806 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1807  sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1808 
1809 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1810  sse42_idx_arr_block_lookup(idx, size, nb, start)
1811 
1812 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1813  sse42_set_block_bits(block, idx, start, stop)
1814 
1815 #define VECT_BLOCK_CHANGE(block, size) \
1816  sse42_bit_block_calc_change((__m128i*)block, size)
1817 
1818 #define VECT_BLOCK_XOR_CHANGE(block, xor_block, size) \
1819  sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size)
1820 
1821 #ifdef BM64_SSE4
1822 #define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
1823  sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
1824 #endif
1825 
1826 #define VECT_BIT_FIND_FIRST(src, pos) \
1827  sse42_bit_find_first((__m128i*) src, pos)
1828 
1829 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1830  sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1831 
1832 #define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1833  sse42_bit_block_xor(t, src, src_xor, d)
1834 
1835 #define VECT_GAP_BFIND(buf, pos, is_set) \
1836  sse42_gap_bfind(buf, pos, is_set)
1837 
1838 #ifdef __GNUG__
1839 #pragma GCC diagnostic pop
1840 #endif
1841 
1842 #ifdef _MSC_VER
1843 #pragma warning( pop )
1844 #endif
1845 
1846 } // namespace
1847 
1848 
1849 
1850 
1851 #endif
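The VECT_* macros above are the dispatch points through which BitMagic's portable bit-block algorithms select the SSE4.2 kernels declared in this header. As a rough illustration of the kind of kernel they bind to, here is a minimal sketch (not the library's actual implementation) of a 2-operand AND over a 16-byte-aligned block, with an accumulator so the caller can detect that the destination became empty; the function name and the n_vectors parameter are assumptions for the example.

 #include <smmintrin.h>   // SSE4.1/4.2 intrinsics

 // Minimal sketch only: dst &= src over n_vectors 128-bit words; returns
 // non-zero if any result word is non-zero.
 inline unsigned and_block_sketch(__m128i* dst, const __m128i* src,
                                  unsigned n_vectors)
 {
     __m128i acc = _mm_setzero_si128();
     for (unsigned i = 0; i < n_vectors; ++i)
     {
         __m128i w = _mm_and_si128(_mm_load_si128(dst + i),
                                   _mm_load_si128(src + i));
         _mm_store_si128(dst + i, w);
         acc = _mm_or_si128(acc, w);              // accumulate for the emptiness test
     }
     return unsigned(!_mm_testz_si128(acc, acc)); // 0 when every result word was zero
 }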
bm::sse42_shift_l1
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift left by 1
Definition: bmsse4.h:1437
BM_ALIGN16ATTR
#define BM_ALIGN16ATTR
Definition: bmdef.h:277
bm::sse42_gap_test
unsigned sse42_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to linear scan.
Definition: bmsse4.h:1106
bm::sse4_is_digest_zero
bool sse4_is_digest_zero(const __m128i *BMRESTRICT block)
check if digest stride is all zero bits
Definition: bmsse4.h:200
bm::set_block_size
const unsigned set_block_size
Definition: bmconst.h:54
bm::op_xor
BMFORCEINLINE unsigned op_xor(unsigned a, unsigned b)
Definition: bmsse4.h:107
bm::sse4_and_block
unsigned sse4_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND blocks, 2-operand: dst &= *src.
Definition: bmsse4.h:237
bm::sse4_bit_count_op
bm::id_t sse4_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func)
Definition: bmsse4.h:133
bm::op_and
BMFORCEINLINE unsigned op_and(unsigned a, unsigned b)
Definition: bmsse4.h:126
bm::id64_t
unsigned long long int id64_t
Definition: bmconst.h:34
bm::sse42_bit_find_first_diff
bool sse42_bit_find_first_diff(const __m128i *BMRESTRICT block1, const __m128i *BMRESTRICT block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
Definition: bmsse4.h:873
bm::set_word_shift
const unsigned set_word_shift
Definition: bmconst.h:71
bm::sse42_bit_block_calc_change
unsigned sse42_bit_block_calc_change(const __m128i *BMRESTRICT block, unsigned size)
Definition: bmsse4.h:633
bm::sse42_idx_arr_block_lookup
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
Definition: bmsse4.h:1222
bm::sse4_block_set_digest
void sse4_block_set_digest(__m128i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
Definition: bmsse4.h:219
bm::sse42_bit_find_first
bool sse42_bit_find_first(const __m128i *BMRESTRICT block, unsigned *pos)
Find first non-zero bit.
Definition: bmsse4.h:929
bm::sse4_is_all_one
bool sse4_is_all_one(const __m128i *BMRESTRICT block)
check if block is all one bits
Definition: bmsse4.h:559
bm::sse4_lower_bound_scan_u32
unsigned sse4_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
Definition: bmsse4.h:1154
BM_ALIGN32ATTR
#define BM_ALIGN32ATTR
Definition: bmdef.h:282
bm::sse42_cmpge_u32
int sse42_cmpge_u32(__m128i vect4, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
Definition: bmsse4.h:1123
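SSE has no native unsigned 32-bit greater-or-equal compare, so a common SSE4.1 workaround is max-then-compare-equal: max(v, t) == v exactly when v >= t (unsigned). A minimal sketch of that trick follows (an assumed illustration, not necessarily the exact code inside sse42_cmpge_u32).

 #include <smmintrin.h>

 // Each 32-bit lane of the result is all-ones where vect4[lane] >= value.
 inline __m128i cmpge_u32_sketch(__m128i vect4, unsigned value)
 {
     __m128i vval = _mm_set1_epi32(int(value));
     return _mm_cmpeq_epi32(_mm_max_epu32(vect4, vval), vect4); // _mm_max_epu32 is SSE4.1
 }

A caller can then reduce the lane mask with _mm_movemask_ps(_mm_castsi128_ps(mask)) to obtain a 4-bit match mask.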
bm::sse4_bit_count
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end)
Definition: bmsse4.h:78
bm::sse4_bit_block_gather_scatter
void sse4_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
Definition: bmsse4.h:1343
bm::sse4_and_digest_5way
bool sse4_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4)
AND block digest stride.
Definition: bmsse4.h:379
bm::id_max
const unsigned id_max
Definition: bmconst.h:108
bm::set_block_mask
const unsigned set_block_mask
Definition: bmconst.h:56
bm::op_or
BMFORCEINLINE unsigned op_or(unsigned a, unsigned b)
Definition: bmsse4.h:117
BM_ALIGN32
#define BM_ALIGN32
Definition: bmdef.h:281
bm::sse4_sub_digest_2way
bool sse4_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
Definition: bmsse4.h:511
bm::set_block_digest_wave_size
const unsigned set_block_digest_wave_size
Definition: bmconst.h:66
bm::sse42_bit_block_calc_change_bc
void sse42_bit_block_calc_change_bc(const __m128i *BMRESTRICT block, unsigned *gc, unsigned *bc)
Definition: bmsse4.h:798
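Both outputs can be derived with popcounts: the bit count (bc) is the sum of per-word popcounts, and the gap count (gc) is one plus the number of adjacent-bit transitions, which popcount(w ^ ((w << 1) | prev_bit)) yields per word. Below is a scalar sketch of that trick; the names and the seeding convention for the first word are assumptions.

 #include <nmmintrin.h>   // _mm_popcnt_u32

 // Illustrative scalar version of a combined gap-count / bit-count pass.
 inline void block_change_bc_sketch(const unsigned* block, unsigned word_count,
                                    unsigned* gc, unsigned* bc)
 {
     unsigned bit_count = 0;
     unsigned change = 1;                 // runs = transitions + 1
     unsigned prev_bit = block[0] & 1u;   // seed so the first word adds no false transition
     for (unsigned i = 0; i < word_count; ++i)
     {
         unsigned w = block[i];
         bit_count += (unsigned)_mm_popcnt_u32(w);
         unsigned diff = w ^ ((w << 1) | prev_bit); // bit j set when bit j != bit j-1
         change += (unsigned)_mm_popcnt_u32(diff);
         prev_bit = w >> 31;              // carry the MSB into the next word
     }
     *gc = change;
     *bc = bit_count;
 }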
bmsse_util.h
Compute functions for SSE SIMD instruction set (internal)
bm::sse42_gap_bfind
unsigned sse42_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to linear scan.
Definition: bmsse4.h:1058
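The hybrid idea is to bisect only while the search window is large, then finish the small remainder with a linear scan, which the SIMD variant can vectorize. A scalar sketch of the pattern follows; the cutoff, the names and the lower-bound semantics are assumptions, not the exact contract of sse42_gap_bfind.

 // Returns the first index in [from, to) whose element is >= pos.
 inline unsigned hybrid_lower_bound_sketch(const unsigned short* buf,
                                           unsigned from, unsigned to,
                                           unsigned short pos)
 {
     const unsigned linear_cutoff = 16;   // assumed switch-over point
     while (to - from > linear_cutoff)    // binary phase
     {
         unsigned mid = (from + to) >> 1;
         if (buf[mid] < pos)
             from = mid + 1;
         else
             to = mid;
     }
     while (from < to && buf[from] < pos) // short linear tail
         ++from;
     return from;
 }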
bm::gap_word_t
unsigned short gap_word_t
Definition: bmconst.h:77
bm::block_waves
const unsigned block_waves
Definition: bmconst.h:65
BM_ASSERT
#define BM_ASSERT
Definition: bmdef.h:130
bm::id_t
unsigned int id_t
Definition: bmconst.h:37
bm::sse4_and_digest_2way
bool sse4_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
AND block digest stride dst = *src1 & *src2.
Definition: bmsse4.h:332
bm::sse4_gap_find
unsigned sse4_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
Definition: bmsse4.h:993
bmdef.h
Definitions(internal)
bm::sse42_bit_block_xor
void sse42_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmsse4.h:1652
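The digest here is a 64-bit mask with one bit per stride (wave) of the block, so only the strides flagged in the digest need the XOR applied while the rest can be passed through. A scalar sketch of that idea follows; the stride width and the pass-through behaviour are assumptions for illustration.

 // Illustrative only: apply XOR on digest-selected strides, copy the rest.
 inline void bit_block_xor_sketch(unsigned* target, const unsigned* src,
                                  const unsigned* src_xor,
                                  unsigned long long digest,
                                  unsigned stride_words)
 {
     for (unsigned i = 0; i < 64; ++i)
     {
         const unsigned off = i * stride_words;
         const bool apply = (digest >> i) & 1u;
         for (unsigned j = 0; j < stride_words; ++j)
             target[off + j] = apply ? (src[off + j] ^ src_xor[off + j])
                                     : src[off + j];
     }
 }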
bmutil.h
Bit manipulation primitives (internal)
bm::set_block_shift
const unsigned set_block_shift
Definition: bmconst.h:55
bm::sse4_sub_digest
bool sse4_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
SUB (AND NOT) block digest stride dst &= ~*src.
Definition: bmsse4.h:462
bm::sse42_shift_r1_and
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *BMRESTRICT mask_block, bm::id64_t *digest)
block shift right by 1 plus AND
Definition: bmsse4.h:1537
bm::sse4_is_all_zero
bool sse4_is_all_zero(const __m128i *BMRESTRICT block)
check if block is all zero bits
Definition: bmsse4.h:175
BMFORCEINLINE
#define BMFORCEINLINE
Definition: bmdef.h:203
bm
Definition: bm.h:76
bm::sse42_test_all_zero_wave2
BMFORCEINLINE bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
Definition: bmsse4.h:606
bm::sse42_shift_r1
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift right by 1
Definition: bmsse4.h:1487
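A block-wide shift by one position reduces to shifting each machine word and propagating a carry bit from word to word; the SSE version does this 128 bits at a time with extra shuffling for the cross-lane carry. A scalar sketch of the carry-propagation pattern follows; the direction convention and names are assumptions.

 // Shift every bit one position toward higher indices; co is the carry in,
 // the returned value is the carry out of the block.
 inline unsigned shift_block_sketch(unsigned* block, unsigned word_count,
                                    unsigned co)
 {
     for (unsigned i = 0; i < word_count; ++i)
     {
         unsigned w = block[i];
         block[i] = (w << 1) | co;   // carry enters at bit 0
         co = w >> 31;               // the old MSB becomes the next carry
     }
     return co;
 }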
bm::set_word_mask
const unsigned set_word_mask
Definition: bmconst.h:72
bm::word_t
unsigned int word_t
Definition: bmconst.h:38
BMRESTRICT
#define BMRESTRICT
Definition: bmdef.h:193
bm::sse4_and_digest
bool sse4_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND block digest stride dst &= *src.
Definition: bmsse4.h:284
bm::sse42_bit_block_calc_xor_change
unsigned sse42_bit_block_calc_xor_change(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT xor_block, unsigned size)
Definition: bmsse4.h:711
BM_ALIGN16
#define BM_ALIGN16
Definition: bmdef.h:276
bm::sse42_test_all_one_wave
BMFORCEINLINE bool sse42_test_all_one_wave(const void *ptr)
check if SSE wave is all 0xFFFF...FFF
Definition: bmsse4.h:584
bm::sse42_test_all_eq_wave2
BMFORCEINLINE bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if a wave of 2 pointers is the same (NULL or FULL)
Definition: bmsse4.h:619
bm::sse42_test_all_zero_wave
BMFORCEINLINE bool sse42_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
Definition: bmsse4.h:595
bm::sse42_set_block_bits
void sse42_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Definition: bmsse4.h:1259
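Scattering an index list into a bit-block is, at its core, a word select plus an OR of a single-bit mask per index; the SSE4.2 version gathers, ORs and scatters several words per iteration. A scalar sketch follows, assuming idx[] already holds block-local bit positions, which is a simplification.

 // Illustrative only: set the bits listed in idx[start..stop) inside a block
 // of 32-bit words.
 inline void set_block_bits_sketch(unsigned* block, const unsigned* idx,
                                   unsigned start, unsigned stop)
 {
     for (unsigned i = start; i < stop; ++i)
     {
         unsigned bit = idx[i];
         block[bit >> 5u] |= (1u << (bit & 31u)); // word = bit/32, offset = bit%32
     }
 }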