BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 /*
50 inline
51 void sse2_print128(const char* prefix, const __m128i & value)
52 {
53  const size_t n = sizeof(__m128i) / sizeof(unsigned);
54  unsigned buffer[n];
55  _mm_storeu_si128((__m128i*)buffer, value);
56  std::cout << prefix << " [ ";
57  for (int i = n-1; 1; --i)
58  {
59  std::cout << buffer[i] << " ";
60  if (i == 0)
61  break;
62  }
63  std::cout << "]" << std::endl;
64 }
65 */
66 
67 /*!
68  SSE4.2 optimized bit counting.
69  @ingroup SSE4
70 */
71 inline
72 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
73 {
74  bm::id_t count = 0;
75 #ifdef BM64_SSE4
76  const bm::id64_t* b = (bm::id64_t*) block;
77  const bm::id64_t* b_end = (bm::id64_t*) block_end;
78  do
79  {
80  count += unsigned( _mm_popcnt_u64(b[0]) +
81  _mm_popcnt_u64(b[1]));
82  b += 2;
83  } while (b < b_end);
84 #else
85  do
86  {
87  const unsigned* b = (unsigned*) block;
88  count += _mm_popcnt_u32(b[0]) +
89  _mm_popcnt_u32(b[1]) +
90  _mm_popcnt_u32(b[2]) +
91  _mm_popcnt_u32(b[3]);
92  } while (++block < block_end);
93 #endif
94  return count;
95 }
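/*
 Usage sketch (illustrative only, not part of the library): counting the
 population of one bit-block with sse4_bit_count(). "my_block" is a
 hypothetical, zero-initialized block of bm::set_block_size words.

 bm::word_t BM_ALIGN16 my_block[bm::set_block_size] BM_ALIGN16ATTR = {0};
 my_block[0] = 0x5u;   // bits 0 and 2
 my_block[10] = 1u;    // one more bit
 bm::id_t cnt = bm::sse4_bit_count(
                     (const __m128i*) my_block,
                     (const __m128i*) (my_block + bm::set_block_size));
 // cnt == 3
*/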
96 
97 /*!
98 \internal
99 */
100 BMFORCEINLINE
101 unsigned op_xor(unsigned a, unsigned b)
102 {
103  unsigned ret = (a ^ b);
104  return ret;
105 }
106 
107 /*!
108 \internal
109 */
110 BMFORCEINLINE
111 unsigned op_or(unsigned a, unsigned b)
112 {
113  return (a | b);
114 }
115 
116 /*!
117 \internal
118 */
119 BMFORCEINLINE
120 unsigned op_and(unsigned a, unsigned b)
121 {
122  return (a & b);
123 }
124 
125 
126 template<class Func>
127 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
128  const __m128i* BMRESTRICT block_end,
129  const __m128i* BMRESTRICT mask_block,
130  Func sse2_func)
131 {
132  bm::id_t count = 0;
133 #ifdef BM64_SSE4
134  do
135  {
136  __m128i tmp0 = _mm_load_si128(block);
137  __m128i tmp1 = _mm_load_si128(mask_block);
138  __m128i b = sse2_func(tmp0, tmp1);
139 
140  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
141  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
142 
143  ++block; ++mask_block;
144  } while (block < block_end);
145 #else
146  do
147  {
148  __m128i tmp0 = _mm_load_si128(block);
149  __m128i tmp1 = _mm_load_si128(mask_block);
150  __m128i b = sse2_func(tmp0, tmp1);
151 
152  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
153  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
154  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
155  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
156 
157  ++block; ++mask_block;
158  } while (block < block_end);
159 #endif
160 
161  return count;
162 }
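/*
 Usage sketch: population count of a logical operation over two blocks, the
 way the VECT_BITCOUNT_AND / _OR / _XOR / _SUB macros below use this template.
 sse2_and comes from bmsse_util.h; blk_a / blk_b are hypothetical 16-byte
 aligned arrays of bm::set_block_size words.

 bm::id_t cnt = bm::sse4_bit_count_op(
                     (const __m128i*) blk_a,
                     (const __m128i*) (blk_a + bm::set_block_size),
                     (const __m128i*) blk_b,
                     bm::sse2_and);
*/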
163 
164 /*!
165  @brief check if block is all zero bits
166  @ingroup SSE4
167 */
168 inline
169 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
170 {
171  __m128i w;
172  __m128i maskz = _mm_setzero_si128();
173  const __m128i* BMRESTRICT block_end =
174  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
175 
176  do
177  {
178  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
179  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
180  return false;
181  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
182  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
183  return false;
184  block += 4;
185  } while (block < block_end);
186  return true;
187 }
188 
189 /*!
190  @brief check if digest stride is all zero bits
191  @ingroup SSE4
192 */
193 inline
194 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
195 {
196  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
197  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
198  wA = _mm_or_si128(wA, wB);
199  bool z1 = _mm_test_all_zeros(wA, wA);
200 
201  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
202  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
203  wA = _mm_or_si128(wA, wB);
204  bool z2 = _mm_test_all_zeros(wA, wA);
205  return z1 & z2;
206 }
207 
208 
209 /*!
210  @brief AND blocks2
211  *dst &= *src
212 
213  @return 0 if no bits were set
214 
215  @ingroup SSE4
216 */
217 inline
218 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
219  const __m128i* BMRESTRICT src)
220 {
221  __m128i m1A, m1B, m1C, m1D;
222  __m128i accA, accB, accC, accD;
223 
224  const __m128i* BMRESTRICT src_end =
225  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
226 
227  accA = accB = accC = accD = _mm_setzero_si128();
228 
229  do
230  {
231  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
232  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
233  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
234  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
235 
236  _mm_store_si128(dst+0, m1A);
237  _mm_store_si128(dst+1, m1B);
238  _mm_store_si128(dst+2, m1C);
239  _mm_store_si128(dst+3, m1D);
240 
241  accA = _mm_or_si128(accA, m1A);
242  accB = _mm_or_si128(accB, m1B);
243  accC = _mm_or_si128(accC, m1C);
244  accD = _mm_or_si128(accD, m1D);
245 
246  src += 4; dst += 4;
247  } while (src < src_end);
248 
249  accA = _mm_or_si128(accA, accB); // A = A | B
250  accC = _mm_or_si128(accC, accD); // C = C | D
251  accA = _mm_or_si128(accA, accC); // A = A | C
252 
253  return !_mm_testz_si128(accA, accA);
254 }
255 
256 
257 /*!
258  @brief AND block digest stride
259  *dst &= *src
260 
261  @return true if stride is all zero
262  @ingroup SSE4
263 */
264 inline
265 bool sse4_and_digest(__m128i* BMRESTRICT dst,
266  const __m128i* BMRESTRICT src)
267 {
268  __m128i m1A, m1B, m1C, m1D;
269 
270  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
271  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
272  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
273  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
274 
275  _mm_store_si128(dst+0, m1A);
276  _mm_store_si128(dst+1, m1B);
277  _mm_store_si128(dst+2, m1C);
278  _mm_store_si128(dst+3, m1D);
279 
280  m1A = _mm_or_si128(m1A, m1B);
281  m1C = _mm_or_si128(m1C, m1D);
282  m1A = _mm_or_si128(m1A, m1C);
283 
284  bool z1 = _mm_testz_si128(m1A, m1A);
285 
286  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
287  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
288  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
289  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
290 
291  _mm_store_si128(dst+4, m1A);
292  _mm_store_si128(dst+5, m1B);
293  _mm_store_si128(dst+6, m1C);
294  _mm_store_si128(dst+7, m1D);
295 
296  m1A = _mm_or_si128(m1A, m1B);
297  m1C = _mm_or_si128(m1C, m1D);
298  m1A = _mm_or_si128(m1A, m1C);
299 
300  bool z2 = _mm_testz_si128(m1A, m1A);
301 
302  return z1 & z2;
303 }
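/*
 Scalar sketch of the digest-stride semantics implemented above (illustration
 only): one stride is the eight 128-bit lanes (32 words) processed by this
 function; dst &= src over the stride, returning true when the result is all
 zero.

 bool scalar_and_digest(bm::word_t* dst, const bm::word_t* src)
 {
     bm::word_t acc = 0;
     for (unsigned i = 0; i < 32; ++i)   // 8 x 128-bit lanes = 32 words
         acc |= (dst[i] &= src[i]);
     return acc == 0;
 }
*/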
304 
305 /*!
306  @brief AND block digest stride
307  *dst = *src1 & src2
308 
309  @return true if stride is all zero
310  @ingroup SSE4
311 */
312 inline
313 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
314  const __m128i* BMRESTRICT src1,
315  const __m128i* BMRESTRICT src2)
316 {
317  __m128i m1A, m1B, m1C, m1D;
318 
319  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
320  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
321  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
322  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
323 
324  _mm_store_si128(dst+0, m1A);
325  _mm_store_si128(dst+1, m1B);
326  _mm_store_si128(dst+2, m1C);
327  _mm_store_si128(dst+3, m1D);
328 
329  m1A = _mm_or_si128(m1A, m1B);
330  m1C = _mm_or_si128(m1C, m1D);
331  m1A = _mm_or_si128(m1A, m1C);
332 
333  bool z1 = _mm_testz_si128(m1A, m1A);
334 
335  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
336  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
337  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
338  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
339 
340  _mm_store_si128(dst+4, m1A);
341  _mm_store_si128(dst+5, m1B);
342  _mm_store_si128(dst+6, m1C);
343  _mm_store_si128(dst+7, m1D);
344 
345  m1A = _mm_or_si128(m1A, m1B);
346  m1C = _mm_or_si128(m1C, m1D);
347  m1A = _mm_or_si128(m1A, m1C);
348 
349  bool z2 = _mm_testz_si128(m1A, m1A);
350 
351  return z1 & z2;
352 }
353 
354 /*!
355  @brief AND block digest stride
356  @return true if stride is all zero
357  @ingroup SSE4
358 */
359 inline
360 bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
361  const __m128i* BMRESTRICT src1,
362  const __m128i* BMRESTRICT src2,
363  const __m128i* BMRESTRICT src3,
364  const __m128i* BMRESTRICT src4)
365 {
366  __m128i m1A, m1B, m1C, m1D;
367  __m128i m1E, m1F, m1G, m1H;
368 
369  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
370  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
371  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
372  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
373 
374  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
375  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
376  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
377  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
378 
379  m1A = _mm_and_si128(m1A, m1E);
380  m1B = _mm_and_si128(m1B, m1F);
381  m1C = _mm_and_si128(m1C, m1G);
382  m1D = _mm_and_si128(m1D, m1H);
383 
384  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
385  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
386  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
387  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
388 
389  _mm_store_si128(dst+0, m1A);
390  _mm_store_si128(dst+1, m1B);
391  _mm_store_si128(dst+2, m1C);
392  _mm_store_si128(dst+3, m1D);
393 
394  m1A = _mm_or_si128(m1A, m1B);
395  m1C = _mm_or_si128(m1C, m1D);
396  m1A = _mm_or_si128(m1A, m1C);
397 
398  bool z1 = _mm_testz_si128(m1A, m1A);
399 
400  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
401  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
402  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
403  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
404 
405  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
406  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
407  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
408  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
409 
410  m1A = _mm_and_si128(m1A, m1E);
411  m1B = _mm_and_si128(m1B, m1F);
412  m1C = _mm_and_si128(m1C, m1G);
413  m1D = _mm_and_si128(m1D, m1H);
414 
415  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
416  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
417  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
418  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
419 
420  _mm_store_si128(dst+4, m1A);
421  _mm_store_si128(dst+5, m1B);
422  _mm_store_si128(dst+6, m1C);
423  _mm_store_si128(dst+7, m1D);
424 
425  m1A = _mm_or_si128(m1A, m1B);
426  m1C = _mm_or_si128(m1C, m1D);
427  m1A = _mm_or_si128(m1A, m1C);
428 
429  bool z2 = _mm_testz_si128(m1A, m1A);
430 
431  return z1 & z2;
432 }
433 
434 
435 /*!
436  @brief SUB (AND NOT) block digest stride
437  *dst &= ~*src
438 
439  @return true if stride is all zero
440  @ingroup SSE4
441 */
442 inline
443 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
444  const __m128i* BMRESTRICT src)
445 {
446  __m128i m1A, m1B, m1C, m1D;
447 
448  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
449  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
450  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
451  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
452 
453  _mm_store_si128(dst+0, m1A);
454  _mm_store_si128(dst+1, m1B);
455  _mm_store_si128(dst+2, m1C);
456  _mm_store_si128(dst+3, m1D);
457 
458  m1A = _mm_or_si128(m1A, m1B);
459  m1C = _mm_or_si128(m1C, m1D);
460  m1A = _mm_or_si128(m1A, m1C);
461 
462  bool z1 = _mm_testz_si128(m1A, m1A);
463 
464  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
465  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
466  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
467  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
468 
469  _mm_store_si128(dst+4, m1A);
470  _mm_store_si128(dst+5, m1B);
471  _mm_store_si128(dst+6, m1C);
472  _mm_store_si128(dst+7, m1D);
473 
474  m1A = _mm_or_si128(m1A, m1B);
475  m1C = _mm_or_si128(m1C, m1D);
476  m1A = _mm_or_si128(m1A, m1C);
477 
478  bool z2 = _mm_testz_si128(m1A, m1A);
479 
480  return z1 & z2;
481 }
482 
483 
484 
485 /*!
486  @brief check if block is all one bits
487  @ingroup SSE4
488 */
489 inline
490 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
491 {
492  __m128i w;
493  const __m128i* BMRESTRICT block_end =
494  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
495 
496  do
497  {
498  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
499  if (!_mm_test_all_ones(w))
500  return false;
501  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
502  if (!_mm_test_all_ones(w))
503  return false;
504 
505  block+=4;
506  } while (block < block_end);
507  return true;
508 }
509 
510 /*!
511  @brief check if wave of pointers is all NULL
512  @ingroup SSE4
513 */
514 BMFORCEINLINE
515 bool sse42_test_all_zero_wave(const void* ptr)
516 {
517  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
518  return _mm_testz_si128(w0, w0);
519 }
520 
521 /*!
522  SSE4.2 calculate number of bit changes from 0 to 1
523  @ingroup SSE4
524 */
525 inline
526 unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block)
527 {
528  const __m128i* block_end =
529  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
530  __m128i m1COshft, m2COshft;
531 
532  unsigned w0 = *((bm::word_t*)(block));
533  unsigned count = 1;
534 
535  unsigned co2, co1 = 0;
536  for (;block < block_end; block += 2)
537  {
538  __m128i m1A = _mm_load_si128(block);
539  __m128i m2A = _mm_load_si128(block+1);
540 
541  __m128i m1CO = _mm_srli_epi32(m1A, 31);
542  __m128i m2CO = _mm_srli_epi32(m2A, 31);
543 
544  co2 = _mm_extract_epi32(m1CO, 3);
545 
546  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
547  __m128i m2As = _mm_slli_epi32(m2A, 1);
548 
549  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
550  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
551 
552  co1 = co2;
553 
554  co2 = _mm_extract_epi32(m2CO, 3);
555 
556  m2COshft = _mm_slli_si128 (m2CO, 4);
557  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
558 
559  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
560  m2As = _mm_or_si128(m2As, m2COshft);
561 
562  co1 = co2;
563 
564  // we now have two shifted SSE4 regs with carry-over
565  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
566  m2A = _mm_xor_si128(m2A, m2As);
567 
568  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
569  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
570  count += _mm_popcnt_u64(m0) + _mm_popcnt_u64(m1);
571 
572  m0 = _mm_extract_epi64(m2A, 0);
573  m1 = _mm_extract_epi64(m2A, 1);
574  count += _mm_popcnt_u64(m0) + _mm_popcnt_u64(m1);
575  }
576  count -= (w0 & 1u); // correct initial carry-in error
577  return count;
578 }
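/*
 Scalar reading of what the loop above computes (a sketch derived from the
 code, not an authoritative spec): 1 plus the number of positions where bit i
 differs from bit i-1 across the whole block, i.e. the number of same-bit
 runs, which is what GAP-length estimation needs.

 unsigned scalar_block_change(const bm::word_t* block)
 {
     unsigned count = 1;
     unsigned prev = block[0] & 1u;
     for (unsigned i = 1; i < bm::set_block_size * 32; ++i)
     {
         unsigned bit = (block[i >> 5] >> (i & 31u)) & 1u;
         count += (bit != prev);
         prev = bit;
     }
     return count;
 }
*/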
579 
580 
581 #ifdef __GNUG__
582 // necessary measure to silence a false GCC warning about negative pointer arithmetic
583 #pragma GCC diagnostic push
584 #pragma GCC diagnostic ignored "-Warray-bounds"
585 #endif
586 
587 /*!
588  SSE4.2 check of one to two (variable length) 128-bit SSE lines for gap search results (8 elements)
589  @ingroup SSE4
590  \internal
591 */
592 inline
593 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
594 {
595  BM_ASSERT(size <= 16);
596  BM_ASSERT(size);
597 
598  const unsigned unroll_factor = 8;
599  if (size < 4) // for very short vector use conventional scan
600  {
601  unsigned j;
602  for (j = 0; j < size; ++j)
603  {
604  if (pbuf[j] >= pos)
605  break;
606  }
607  return j;
608  }
609 
610  __m128i m1, mz, maskF, maskFL;
611 
612  mz = _mm_setzero_si128();
613  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
614 
615  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
616  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
617  int shiftL= (64 - (unroll_factor - size) * 16);
618  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
619 
620  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
621  m1 = _mm_or_si128(m1, maskFL);
622 
623  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
624  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
625  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
626  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
627  if (mi)
628  {
629  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
630  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
631  return unroll_factor - bc; // address of first one element (target)
632  }
633  // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
634  // GCC gives a false warning for "- unroll_factor" here
635  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
636  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
637 
638  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
639  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
640  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
641  unsigned bc = _mm_popcnt_u32(mi);
642 
643  return size - bc;
644 }
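/*
 Worked illustration (hypothetical values): for the sorted gap_word_t buffer
 { 10, 20, 30, 40, 50, 60, 70, 80 } with size = 8, sse4_gap_find(pbuf, 35, 8)
 returns 3 (the index of the first element >= 35, i.e. the value 40),
 matching a scalar lower-bound scan over the first "size" elements.
*/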
645 
646 /**
647  Experimental (test) function to do SIMD vector search (lower bound)
648  in sorted, growing array
649  @ingroup SSE4
650 
651  \internal
652 */
653 inline
654 int sse42_cmpge_u32(__m128i vect4, unsigned value)
655 {
656  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
657  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
658  //
659  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
660  __m128i mm_val = _mm_set1_epi32(value);
661 
662  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
663  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
664 
665  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
666  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
667 
668  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
669  int mask = _mm_movemask_epi8(cmp_mask_ge);
670  if (mask)
671  {
672  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
673  return bsf / 4;
674  }
675  return -1;
676 }
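/*
 Worked illustration of the 0x80000000 bias trick used above: SSE has no
 unsigned 32-bit compare, but subtracting 0x80000000 from both operands maps
 unsigned order onto signed order (0xFFFFFFFF becomes 0x7FFFFFFF, the largest
 signed value), so the signed _mm_cmpgt_epi32 yields the unsigned ">".

 __m128i v = _mm_set_epi32(400, 300, 200, 100);  // lane 0 holds 100
 int lane = bm::sse42_cmpge_u32(v, 250u);        // lane == 2 (value 300)
 lane = bm::sse42_cmpge_u32(v, 500u);            // lane == -1 (no lane >= 500)
*/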
677 
678 
679 /**
680  lower bound (greater or equal) linear scan in an ascending-order sorted array
681  @ingroup SSE4
682  \internal
683 */
684 inline
685 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
686  unsigned target,
687  unsigned from,
688  unsigned to)
689 {
690  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
691  // see more at:
692  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
693 
694  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
695 
696  unsigned unroll_factor = 8;
697  unsigned len = to - from + 1;
698  unsigned len_unr = len - (len % unroll_factor);
699 
700  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
701  __m128i vect_target = _mm_set1_epi32(target);
702  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
703 
704  int mask;
705  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
706 
707  unsigned k = 0;
708  for (; k < len_unr; k+=unroll_factor)
709  {
710  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
711  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
712 
713  cmp_mask_ge = _mm_or_si128( // GT | EQ
714  _mm_cmpgt_epi32 (norm_vect40, norm_target),
715  _mm_cmpeq_epi32 (vect40, vect_target)
716  );
717  mask = _mm_movemask_epi8(cmp_mask_ge);
718  if (mask)
719  {
720  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
721  return from + k + (bsf / 4);
722  }
723  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
724  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
725 
726  cmp_mask_ge = _mm_or_si128(
727  _mm_cmpgt_epi32 (norm_vect41, norm_target),
728  _mm_cmpeq_epi32 (vect41, vect_target)
729  );
730  mask = _mm_movemask_epi8(cmp_mask_ge);
731  if (mask)
732  {
733  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
734  return 4 + from + k + (bsf / 4);
735  }
736  } // for
737 
738  for (; k < len; ++k)
739  {
740  if (arr_base[k] >= target)
741  return from + k;
742  }
743  return to + 1;
744 }
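/*
 Usage sketch (hypothetical array): returns the first index in [from, to]
 whose value is >= target, or to + 1 when no such element exists.

 unsigned arr[] = { 1, 5, 9, 12, 40, 40, 51, 80, 90, 100, 110 };
 unsigned pos  = bm::sse4_lower_bound_scan_u32(arr, 40, 0, 10);  // pos  == 4
 unsigned npos = bm::sse4_lower_bound_scan_u32(arr, 200, 0, 10); // npos == 11
*/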
745 
746 
747 
748 /*!
749  SSE4.2 index lookup to check what belongs to the same block (8 elements)
750  \internal
751 */
752 inline
753 unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
754  unsigned nb, unsigned start)
755 {
756  const unsigned unroll_factor = 8;
757  const unsigned len = (size - start);
758  const unsigned len_unr = len - (len % unroll_factor);
759  unsigned k;
760 
761  idx += start;
762 
763  __m128i nbM = _mm_set1_epi32(nb);
764 
765  for (k = 0; k < len_unr; k+=unroll_factor)
766  {
767  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
768  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
769  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
770  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
771 
772  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
773  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
774  break;
775 
776  } // for k
777  for (; k < len; ++k)
778  {
779  if (nb != unsigned(idx[k] >> bm::set_block_shift))
780  break;
781  }
782  return start + k;
783 }
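/*
 Semantics sketch (hypothetical data): starting at "start", advance while
 idx[k] >> bm::set_block_shift still equals the block number nb, and return
 the position where the run of same-block indices ends.

 unsigned idx[] = { 1, 2, 70000 };
 unsigned k = bm::sse42_idx_arr_block_lookup(idx, 3, 0, 0); // k == 2,
 // assuming the usual 64K-bit blocks (bm::set_block_shift == 16),
 // 70000 belongs to block 1, not block 0
*/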
784 
785 /*!
786  SSE4.2 bulk bit set
787  \internal
788 */
789 inline
790 void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
791  const unsigned* BMRESTRICT idx,
792  unsigned start, unsigned stop )
793 {
794  const unsigned unroll_factor = 4;
795  const unsigned len = (stop - start);
796  const unsigned len_unr = len - (len % unroll_factor);
797 
798  idx += start;
799 
800  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
801  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
802 
803  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
804  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
805 
806  unsigned k = 0;
807  for (; k < len_unr; k+=unroll_factor)
808  {
809  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
810  __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
811  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
812 
813 
814  nbitA = _mm_and_si128 (nbitA, sw_mask);
815  _mm_store_si128 ((__m128i*)mshift_v, nbitA);
816 
817  // check-compare if all 4 bits are in the very same word
818  //
819  __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
820  __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
821  if (_mm_test_all_ones(cmpA)) // check if all are in one word
822  {
823  unsigned nword = _mm_extract_epi32(nwordA, 0);
824  block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
825  |(1u << mshift_v[2]) | (1u << mshift_v[3]);
826  }
827  else // bits are in different words, use scalar scatter
828  {
829  _mm_store_si128 ((__m128i*)mword_v, nwordA);
830 
831  block[mword_v[0]] |= (1u << mshift_v[0]);
832  block[mword_v[1]] |= (1u << mshift_v[1]);
833  block[mword_v[2]] |= (1u << mshift_v[2]);
834  block[mword_v[3]] |= (1u << mshift_v[3]);
835  }
836 
837  } // for k
838 
839  for (; k < len; ++k)
840  {
841  unsigned n = idx[k];
842  unsigned nbit = unsigned(n & bm::set_block_mask);
843  unsigned nword = nbit >> bm::set_word_shift;
844  nbit &= bm::set_word_mask;
845  block[nword] |= (1u << nbit);
846  } // for k
847 }
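/*
 Usage sketch (hypothetical data): scatter-set the bits listed in
 idx[start..stop) into one zero-initialized block; indices are taken modulo
 the block via bm::set_block_mask.

 bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
 unsigned idx[] = { 0, 1, 33, 70 };
 bm::sse42_set_block_bits(blk, idx, 0, 4);
 // blk[0] now holds bits 0 and 1, blk[1] holds bit 1 (for 33),
 // blk[2] holds bit 6 (for 70)
*/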
848 
849 
850 /*!
851  SSE4.2 bit block gather-scatter
852 
853  @param arr - destination array to set bits
854  @param blk - source bit-block
855  @param idx - gather index array
856  @param size - gather array size
857  @param start - gather start index
858  @param bit_idx - bit to set in the target array
859 
860  \internal
861 
862  C algorithm:
863 
864  for (unsigned k = start; k < size; ++k)
865  {
866  nbit = unsigned(idx[k] & bm::set_block_mask);
867  nword = unsigned(nbit >> bm::set_word_shift);
868  mask0 = 1u << (nbit & bm::set_word_mask);
869  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
870  }
871 
872 */
873 inline
874 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
875  const unsigned* BMRESTRICT blk,
876  const unsigned* BMRESTRICT idx,
877  unsigned size,
878  unsigned start,
879  unsigned bit_idx)
880 {
881  const unsigned unroll_factor = 4;
882  const unsigned len = (size - start);
883  const unsigned len_unr = len - (len % unroll_factor);
884 
885  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
886  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
887  __m128i maskFF = _mm_set1_epi32(~0u);
888  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
889 
890  __m128i mask_tmp, mask_0;
891 
892  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
893  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
894 
895  unsigned k = 0;
896  unsigned base = start + k;
897  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
898  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
899  for (; k < len_unr; k+=unroll_factor)
900  {
901  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
902  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
903  // (nbit & bm::set_word_mask)
904  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
905  _mm_store_si128 ((__m128i*)mword_v, nwordA);
906 
907  // mask0 = 1u << (nbit & bm::set_word_mask);
908  //
909 #if 0
910  // ifdefed an alternative SHIFT implementation using SSE and masks
911  // (it is not faster than just doing scalar operations)
912  {
913  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
914  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
915  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
916  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
917  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
918 
919  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
920  _mm_slli_si128 (am_0, 8));
921  mask_tmp = _mm_and_si128 (
922  _mm_slli_epi32(mask1, mshift_v[3]),
923  _mm_slli_si128 (am_0, 12)
924  );
925 
926  mask_0 = _mm_or_si128 (mask_0,
927  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
928  }
929 #endif
930  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
931 
932 
933  // gather for: blk[nword] (.. & mask0 )
934  //
935  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
936  blk[mword_v[1]], blk[mword_v[0]]),
937  mask_0);
938 
939  // bool(blk[nword] ...)
940  //maskFF = _mm_set1_epi32(~0u);
941  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
942  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
943  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
944 
945  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
946 
947  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
948  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
949 
950  ++idx_ptr; ++target_ptr;
951  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
952  }
953 
954  for (; k < len; ++k)
955  {
956  base = start + k;
957  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
958  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
959  }
960 
961 }
962 
963 /*!
964  @brief block shift right by 1
965  @ingroup SSE4
966 */
967 inline
968 bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
969 {
970  __m128i* block_end =
971  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
972  __m128i m1COshft, m2COshft;
973  __m128i mAcc = _mm_set1_epi32(0);
974 
975  unsigned co2;
976 
977  for (;block < block_end; block += 2)
978  {
979  __m128i m1A = _mm_load_si128(block);
980  __m128i m2A = _mm_load_si128(block+1);
981 
982  __m128i m1CO = _mm_srli_epi32(m1A, 31);
983  __m128i m2CO = _mm_srli_epi32(m2A, 31);
984 
985  co2 = _mm_extract_epi32(m1CO, 3);
986 
987  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
988  m2A = _mm_slli_epi32(m2A, 1);
989 
990  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
991  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
992 
993  co1 = co2;
994 
995  co2 = _mm_extract_epi32(m2CO, 3);
996 
997  m2COshft = _mm_slli_si128 (m2CO, 4);
998  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
999 
1000  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1001  m2A = _mm_or_si128(m2A, m2COshft);
1002 
1003  _mm_store_si128(block, m1A);
1004  _mm_store_si128(block+1, m2A);
1005 
1006  mAcc = _mm_or_si128(mAcc, m1A);
1007  mAcc = _mm_or_si128(mAcc, m2A);
1008 
1009  co1 = co2;
1010  }
1011  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1012  return co1;
1013 }
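/*
 Scalar reference of the shift performed above (illustration only): every
 word is shifted left by one bit with the carry propagated into the next
 word, which corresponds to shifting the bit-vector right by one position;
 the return value is the carry out of the last word and *empty_acc becomes
 non-zero iff the shifted block still has any bits set.

 bool scalar_shift_r1(bm::word_t* block, unsigned* empty_acc, unsigned co)
 {
     unsigned acc = 0;
     for (unsigned i = 0; i < bm::set_block_size; ++i)
     {
         unsigned co_out = block[i] >> 31;
         block[i] = (block[i] << 1u) | co;
         acc |= block[i];
         co = co_out;
     }
     *empty_acc = (acc != 0);
     return co;
 }
*/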
1014 
1015 
1016 /*!
1017  @brief block shift right by 1 plus AND
1018 
1019  @return carry over flag
1020  @ingroup SSE4
1021 */
1022 inline
1023 bool sse42_shift_r1_and(__m128i* block,
1024  bm::word_t co1,
1025  const __m128i* BMRESTRICT mask_block,
1026  bm::id64_t* digest)
1027 {
1028  bm::word_t* wblock = (bm::word_t*) block;
1029  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1030 
1031  __m128i m1COshft, m2COshft;
1032  __m128i mAcc = _mm_set1_epi32(0);
1033  unsigned co2;
1034 
1035  bm::id64_t d, wd;
1036  wd = d = *digest;
1037 
1038  unsigned di = 0;
1039  if (!co1)
1040  {
1041  bm::id64_t t = d & -d;
1042  di = _mm_popcnt_u64(t - 1); // find start bit-index
1043  }
1044 
1045  for (; di < 64 ; ++di)
1046  {
1047  const unsigned d_base = di * bm::set_block_digest_wave_size;
1048  bm::id64_t dmask = (1ull << di);
1049  if (d & dmask) // digest stride NOT empty
1050  {
1051  block = (__m128i*) &wblock[d_base];
1052  mask_block = (__m128i*) &mblock[d_base];
1053  for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1054  {
1055  __m128i m1A = _mm_load_si128(block);
1056  __m128i m2A = _mm_load_si128(block+1);
1057 
1058  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1059  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1060 
1061  co2 = _mm_extract_epi32(m1CO, 3);
1062 
1063  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1064  m2A = _mm_slli_epi32(m2A, 1);
1065 
1066  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1067  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1068 
1069  co1 = co2;
1070 
1071  co2 = _mm_extract_epi32(m2CO, 3);
1072 
1073  m2COshft = _mm_slli_si128 (m2CO, 4);
1074  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1075 
1076  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1077  m2A = _mm_or_si128(m2A, m2COshft);
1078 
1079  m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1080  m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1081 
1082  mAcc = _mm_or_si128(mAcc, m1A);
1083  mAcc = _mm_or_si128(mAcc, m2A);
1084 
1085  _mm_store_si128(block, m1A);
1086  _mm_store_si128(block+1, m2A);
1087 
1088  co1 = co2;
1089 
1090  } // for i
1091 
1092  if (_mm_testz_si128(mAcc, mAcc))
1093  d &= ~dmask; // clear digest bit
1094  wd &= wd - 1;
1095  }
1096  else
1097  {
1098  if (co1)
1099  {
1100  BM_ASSERT(co1 == 1);
1101  BM_ASSERT(wblock[d_base] == 0);
1102 
1103  unsigned w0 = wblock[d_base] = co1 & mblock[d_base];
1104  d |= (dmask & ((bm::id64_t)w0 << di)); // update digest (branchless if (w0))
1105  co1 = 0;
1106  }
1107  if (!wd) // digest is empty, no CO -> exit
1108  break;
1109  }
1110  } // for di
1111 
1112  *digest = d;
1113  return co1;
1114 }
1115 
1116 
1117 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1118  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1119 
1120 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1121  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1122 
1123 #define VECT_BITCOUNT(first, last) \
1124  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1125 
1126 #define VECT_BITCOUNT_AND(first, last, mask) \
1127  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1128 
1129 #define VECT_BITCOUNT_OR(first, last, mask) \
1130  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1131 
1132 #define VECT_BITCOUNT_XOR(first, last, mask) \
1133  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1134 
1135 #define VECT_BITCOUNT_SUB(first, last, mask) \
1136  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1137 
1138 #define VECT_INVERT_BLOCK(first) \
1139  sse2_invert_block((__m128i*)first);
1140 
1141 #define VECT_AND_BLOCK(dst, src) \
1142  sse4_and_block((__m128i*) dst, (__m128i*) (src))
1143 
1144 #define VECT_AND_DIGEST(dst, src) \
1145  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1146 
1147 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1148  sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1149 
1150 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1151  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1152 
1153 #define VECT_OR_BLOCK(dst, src) \
1154  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1155 
1156 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1157  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1158 
1159 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1160  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1161 
1162 #define VECT_SUB_BLOCK(dst, src) \
1163  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1164 
1165 #define VECT_SUB_DIGEST(dst, src) \
1166  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1167 
1168 #define VECT_XOR_ARR(dst, src, src_end) \
1169  sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
1170 
1171 #define VECT_COPY_BLOCK(dst, src) \
1172  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1173 
1174 #define VECT_STREAM_BLOCK(dst, src) \
1175  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1176 
1177 #define VECT_SET_BLOCK(dst, value) \
1178  sse2_set_block((__m128i*) dst, value)
1179 
1180 #define VECT_IS_ZERO_BLOCK(dst) \
1181  sse4_is_all_zero((__m128i*) dst)
1182 
1183 #define VECT_IS_ONE_BLOCK(dst) \
1184  sse4_is_all_one((__m128i*) dst)
1185 
1186 #define VECT_IS_DIGEST_ZERO(start) \
1187  sse4_is_digest_zero((__m128i*)start)
1188 
1189 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1190  sse4_lower_bound_scan_u32(arr, target, from, to)
1191 
1192 #define VECT_SHIFT_R1(b, acc, co) \
1193  sse42_shift_r1((__m128i*)b, acc, co)
1194 
1195 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1196  sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1197 
1198 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1199  sse42_idx_arr_block_lookup(idx, size, nb, start)
1200 
1201 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1202  sse42_set_block_bits(block, idx, start, stop)
1203 
1204 #define VECT_BLOCK_CHANGE(block) \
1205  sse42_bit_block_calc_change((__m128i*)block)
1206 
1207 
1208 #ifdef __GNUG__
1209 #pragma GCC diagnostic pop
1210 #endif
1211 
1212 
1213 #ifdef __GNUG__
1214 #pragma GCC diagnostic pop
1215 #endif
1216 
1217 
1218 } // namespace
1219 
1220 
1221 
1222 
1223 #endif