BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 #ifdef _MSC_VER
50 #pragma warning( push )
51 #pragma warning( disable : 4146)
52 #endif
53 
54 
55 /*
56 inline
57 void sse2_print128(const char* prefix, const __m128i & value)
58 {
59  const size_t n = sizeof(__m128i) / sizeof(unsigned);
60  unsigned buffer[n];
61  _mm_storeu_si128((__m128i*)buffer, value);
62  std::cout << prefix << " [ ";
63  for (int i = n-1; 1; --i)
64  {
65  std::cout << buffer[i] << " ";
66  if (i == 0)
67  break;
68  }
69  std::cout << "]" << std::endl;
70 }
71 */
72 
73 /*!
74  SSE4.2 optimized bit counting.
75  @ingroup SSE4
76 */
77 inline
78 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
79 {
80  bm::id_t count = 0;
81 #ifdef BM64_SSE4
82  const bm::id64_t* b = (bm::id64_t*) block;
83  const bm::id64_t* b_end = (bm::id64_t*) block_end;
84  do
85  {
86  count += unsigned( _mm_popcnt_u64(b[0]) +
87  _mm_popcnt_u64(b[1]));
88  b += 2;
89  } while (b < b_end);
90 #else
91  do
92  {
93  const unsigned* b = (unsigned*) block;
94  count += _mm_popcnt_u32(b[0]) +
95  _mm_popcnt_u32(b[1]) +
96  _mm_popcnt_u32(b[2]) +
97  _mm_popcnt_u32(b[3]);
98  } while (++block < block_end);
99 #endif
100  return count;
101 }
102 
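/*
 Illustrative usage sketch (not part of the original header): the block counters
 in this file expect a 16-byte aligned bit-block of bm::set_block_size words.
 The variable names below are assumptions for the example only.

    BM_ALIGN16 bm::word_t blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
    blk[0] = 0xFFu; // 8 bits set
    bm::id_t cnt = bm::sse4_bit_count((const __m128i*) blk,
                                      (const __m128i*) (blk + bm::set_block_size));
    // cnt == 8
*/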
103 /*!
104 \internal
105 */
106 BMFORCEINLINE
107 unsigned op_xor(unsigned a, unsigned b)
108 {
109  unsigned ret = (a ^ b);
110  return ret;
111 }
112 
113 /*!
114 \internal
115 */
116 BMFORCEINLINE
117 unsigned op_or(unsigned a, unsigned b)
118 {
119  return (a | b);
120 }
121 
122 /*!
123 \internal
124 */
125 BMFORCEINLINE
126 unsigned op_and(unsigned a, unsigned b)
127 {
128  return (a & b);
129 }
130 
131 
132 template<class Func>
133 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
134  const __m128i* BMRESTRICT block_end,
135  const __m128i* BMRESTRICT mask_block,
136  Func sse2_func)
137 {
138  bm::id_t count = 0;
139 #ifdef BM64_SSE4
140  do
141  {
142  __m128i tmp0 = _mm_load_si128(block);
143  __m128i tmp1 = _mm_load_si128(mask_block);
144  __m128i b = sse2_func(tmp0, tmp1);
145 
146  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
148 
149  ++block; ++mask_block;
150  } while (block < block_end);
151 #else
152  do
153  {
154  __m128i tmp0 = _mm_load_si128(block);
155  __m128i tmp1 = _mm_load_si128(mask_block);
156  __m128i b = sse2_func(tmp0, tmp1);
157 
158  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
162 
163  ++block; ++mask_block;
164  } while (block < block_end);
165 #endif
166 
167  return count;
168 }
169 
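/*
 Illustrative sketch (an assumption, mirroring the VECT_BITCOUNT_* macros below):
 sse4_bit_count_op combines two aligned blocks with an SSE2 logical functor from
 bmsse_util.h and popcounts the result, e.g. an AND cardinality:

    bm::id_t cnt_and =
        bm::sse4_bit_count_op((const __m128i*) blk1,
                              (const __m128i*) (blk1 + bm::set_block_size),
                              (const __m128i*) blk2,
                              sse2_and); // SSE2 AND functor from bmsse_util.h
*/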
170 /*!
171  @brief check if block is all zero bits
172  @ingroup SSE4
173 */
174 inline
175 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
176 {
177  __m128i w;
178  __m128i maskz = _mm_setzero_si128();
179  const __m128i* BMRESTRICT block_end =
180  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
181 
182  do
183  {
184  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
186  return false;
187  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
189  return false;
190  block += 4;
191  } while (block < block_end);
192  return true;
193 }
194 
195 /*!
196  @brief check if digest stride is all zero bits
197  @ingroup SSE4
198 */
199 inline
200 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
201 {
202  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204  wA = _mm_or_si128(wA, wB);
205  bool z1 = _mm_test_all_zeros(wA, wA);
206 
207  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209  wA = _mm_or_si128(wA, wB);
210  bool z2 = _mm_test_all_zeros(wA, wA);
211  return z1 & z2;
212 }
213 
214 /*!
215  @brief set digest stride to 0xFF.. or 0x0 value
216  @ingroup SSE4
217 */
218 inline
219 void sse4_block_set_digest(__m128i* dst, unsigned value)
220 {
221  __m128i mV = _mm_set1_epi32(int(value));
222  _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223  _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224  _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225  _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
226 }
227 
228 
229 /*!
230  @brief AND blocks
231  *dst &= *src
232 
233  @return 0 if no bits were set
234  @ingroup SSE4
235 */
236 inline
237 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
238  const __m128i* BMRESTRICT src)
239 {
240  __m128i m1A, m1B, m1C, m1D;
241  __m128i accA, accB, accC, accD;
242 
243  const __m128i* BMRESTRICT src_end =
244  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
245 
246  accA = accB = accC = accD = _mm_setzero_si128();
247 
248  do
249  {
250  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
254 
255  _mm_store_si128(dst+0, m1A);
256  _mm_store_si128(dst+1, m1B);
257  _mm_store_si128(dst+2, m1C);
258  _mm_store_si128(dst+3, m1D);
259 
260  accA = _mm_or_si128(accA, m1A);
261  accB = _mm_or_si128(accB, m1B);
262  accC = _mm_or_si128(accC, m1C);
263  accD = _mm_or_si128(accD, m1D);
264 
265  src += 4; dst += 4;
266  } while (src < src_end);
267 
268  accA = _mm_or_si128(accA, accB); // A = A | B
269  accC = _mm_or_si128(accC, accD); // C = C | D
270  accA = _mm_or_si128(accA, accC); // A = A | C
271 
272  return !_mm_testz_si128(accA, accA);
273 }
274 
275 
276 /*!
277  @brief AND block digest stride
278  *dst &= *src
279 
280  @return true if stride is all zero
281  @ingroup SSE4
282 */
283 inline
284 bool sse4_and_digest(__m128i* BMRESTRICT dst,
285  const __m128i* BMRESTRICT src)
286 {
287  __m128i m1A, m1B, m1C, m1D;
288 
289  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
293 
294  _mm_store_si128(dst+0, m1A);
295  _mm_store_si128(dst+1, m1B);
296  _mm_store_si128(dst+2, m1C);
297  _mm_store_si128(dst+3, m1D);
298 
299  m1A = _mm_or_si128(m1A, m1B);
300  m1C = _mm_or_si128(m1C, m1D);
301  m1A = _mm_or_si128(m1A, m1C);
302 
303  bool z1 = _mm_testz_si128(m1A, m1A);
304 
305  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
309 
310  _mm_store_si128(dst+4, m1A);
311  _mm_store_si128(dst+5, m1B);
312  _mm_store_si128(dst+6, m1C);
313  _mm_store_si128(dst+7, m1D);
314 
315  m1A = _mm_or_si128(m1A, m1B);
316  m1C = _mm_or_si128(m1C, m1D);
317  m1A = _mm_or_si128(m1A, m1C);
318 
319  bool z2 = _mm_testz_si128(m1A, m1A);
320 
321  return z1 & z2;
322 }
323 
324 /*!
325  @brief AND block digest stride
326  *dst = *src1 & *src2
327 
328  @return true if stride is all zero
329  @ingroup SSE4
330 */
331 inline
332 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
333  const __m128i* BMRESTRICT src1,
334  const __m128i* BMRESTRICT src2)
335 {
336  __m128i m1A, m1B, m1C, m1D;
337 
338  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
342 
343  _mm_store_si128(dst+0, m1A);
344  _mm_store_si128(dst+1, m1B);
345  _mm_store_si128(dst+2, m1C);
346  _mm_store_si128(dst+3, m1D);
347 
348  m1A = _mm_or_si128(m1A, m1B);
349  m1C = _mm_or_si128(m1C, m1D);
350  m1A = _mm_or_si128(m1A, m1C);
351 
352  bool z1 = _mm_testz_si128(m1A, m1A);
353 
354  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
358 
359  _mm_store_si128(dst+4, m1A);
360  _mm_store_si128(dst+5, m1B);
361  _mm_store_si128(dst+6, m1C);
362  _mm_store_si128(dst+7, m1D);
363 
364  m1A = _mm_or_si128(m1A, m1B);
365  m1C = _mm_or_si128(m1C, m1D);
366  m1A = _mm_or_si128(m1A, m1C);
367 
368  bool z2 = _mm_testz_si128(m1A, m1A);
369 
370  return z1 & z2;
371 }
372 
373 /*!
374  @brief AND block digest stride
375  @return true if stride is all zero
376  @ingroup SSE4
377 */
378 inline
379 bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
380  const __m128i* BMRESTRICT src1,
381  const __m128i* BMRESTRICT src2,
382  const __m128i* BMRESTRICT src3,
383  const __m128i* BMRESTRICT src4)
384 {
385  __m128i m1A, m1B, m1C, m1D;
386  __m128i m1E, m1F, m1G, m1H;
387 
388  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
392 
393  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
397 
398  m1A = _mm_and_si128(m1A, m1E);
399  m1B = _mm_and_si128(m1B, m1F);
400  m1C = _mm_and_si128(m1C, m1G);
401  m1D = _mm_and_si128(m1D, m1H);
402 
403  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
407 
408  _mm_store_si128(dst+0, m1A);
409  _mm_store_si128(dst+1, m1B);
410  _mm_store_si128(dst+2, m1C);
411  _mm_store_si128(dst+3, m1D);
412 
413  m1A = _mm_or_si128(m1A, m1B);
414  m1C = _mm_or_si128(m1C, m1D);
415  m1A = _mm_or_si128(m1A, m1C);
416 
417  bool z1 = _mm_testz_si128(m1A, m1A);
418 
419  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
423 
424  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
428 
429  m1A = _mm_and_si128(m1A, m1E);
430  m1B = _mm_and_si128(m1B, m1F);
431  m1C = _mm_and_si128(m1C, m1G);
432  m1D = _mm_and_si128(m1D, m1H);
433 
434  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
438 
439  _mm_store_si128(dst+4, m1A);
440  _mm_store_si128(dst+5, m1B);
441  _mm_store_si128(dst+6, m1C);
442  _mm_store_si128(dst+7, m1D);
443 
444  m1A = _mm_or_si128(m1A, m1B);
445  m1C = _mm_or_si128(m1C, m1D);
446  m1A = _mm_or_si128(m1A, m1C);
447 
448  bool z2 = _mm_testz_si128(m1A, m1A);
449 
450  return z1 & z2;
451 }
452 
453 
454 /*!
455  @brief SUB (AND NOT) block digest stride
456  *dst &= ~*src
457 
458  @return true if stride is all zero
459  @ingroup SSE4
460 */
461 inline
462 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
463  const __m128i* BMRESTRICT src)
464 {
465  __m128i m1A, m1B, m1C, m1D;
466 
467  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
471 
472  _mm_store_si128(dst+0, m1A);
473  _mm_store_si128(dst+1, m1B);
474  _mm_store_si128(dst+2, m1C);
475  _mm_store_si128(dst+3, m1D);
476 
477  m1A = _mm_or_si128(m1A, m1B);
478  m1C = _mm_or_si128(m1C, m1D);
479  m1A = _mm_or_si128(m1A, m1C);
480 
481  bool z1 = _mm_testz_si128(m1A, m1A);
482 
483  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
487 
488  _mm_store_si128(dst+4, m1A);
489  _mm_store_si128(dst+5, m1B);
490  _mm_store_si128(dst+6, m1C);
491  _mm_store_si128(dst+7, m1D);
492 
493  m1A = _mm_or_si128(m1A, m1B);
494  m1C = _mm_or_si128(m1C, m1D);
495  m1A = _mm_or_si128(m1A, m1C);
496 
497  bool z2 = _mm_testz_si128(m1A, m1A);
498 
499  return z1 & z2;
500 }
501 
502 
503 /*!
504  @brief 2-operand SUB (AND NOT) block digest stride
505  *dst = *src1 & ~*src2
506 
507  @return true if stride is all zero
508  @ingroup SSE4
509 */
510 inline
511 bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
512  const __m128i* BMRESTRICT src1,
513  const __m128i* BMRESTRICT src2)
514 {
515  __m128i m1A, m1B, m1C, m1D;
516 
517  m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518  m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519  m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520  m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
521 
522  _mm_store_si128(dst+0, m1A);
523  _mm_store_si128(dst+1, m1B);
524  _mm_store_si128(dst+2, m1C);
525  _mm_store_si128(dst+3, m1D);
526 
527  m1A = _mm_or_si128(m1A, m1B);
528  m1C = _mm_or_si128(m1C, m1D);
529  m1A = _mm_or_si128(m1A, m1C);
530 
531  bool z1 = _mm_testz_si128(m1A, m1A);
532 
533  m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534  m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535  m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536  m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
537 
538  _mm_store_si128(dst+4, m1A);
539  _mm_store_si128(dst+5, m1B);
540  _mm_store_si128(dst+6, m1C);
541  _mm_store_si128(dst+7, m1D);
542 
543  m1A = _mm_or_si128(m1A, m1B);
544  m1C = _mm_or_si128(m1C, m1D);
545  m1A = _mm_or_si128(m1A, m1C);
546 
547  bool z2 = _mm_testz_si128(m1A, m1A);
548 
549  return z1 & z2;
550 }
551 
552 
553 
554 /*!
555  @brief check if block is all one bits
556  @ingroup SSE4
557 */
558 inline
559 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
560 {
561  __m128i w;
562  const __m128i* BMRESTRICT block_end =
563  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
564 
565  do
566  {
567  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568  if (!_mm_test_all_ones(w))
569  return false;
570  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571  if (!_mm_test_all_ones(w))
572  return false;
573 
574  block+=4;
575  } while (block < block_end);
576  return true;
577 }
578 
579 /*!
580  @brief check if wave of pointers is all NULL
581  @ingroup SSE4
582 */
583 BMFORCEINLINE
584 bool sse42_test_all_zero_wave(const void* ptr)
585 {
586  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
587  return _mm_testz_si128(w0, w0);
588 }
589 
590 /*!
591  @brief check if 2 waves of pointers are all NULL
592  @ingroup SSE4
593 */
594 BMFORCEINLINE
595 bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1)
596 {
597  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
598  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
599  w0 = _mm_or_si128(w0, w1);
600  return _mm_testz_si128(w0, w0);
601 }
602 
603 /*!
604  @brief check if 2 waves of pointers are identical (NULL or FULL)
605  @ingroup SSE4
606 */
607 BMFORCEINLINE
608 bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1)
609 {
610  __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
611  __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
612  w0 = _mm_xor_si128(w0, w1);
613  return _mm_testz_si128(w0, w0);
614 }
615 
616 
617 /*!
618  SSE4.2 calculation of the number of bit changes from 0 to 1
619  @ingroup SSE4
620 */
621 inline
622 unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block)
623 {
624  const __m128i* block_end =
625  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
626  __m128i m1COshft, m2COshft;
627 
628  unsigned w0 = *((bm::word_t*)(block));
629  unsigned count = 1;
630 
631  unsigned co2, co1 = 0;
632  for (;block < block_end; block += 2)
633  {
634  __m128i m1A = _mm_load_si128(block);
635  __m128i m2A = _mm_load_si128(block+1);
636 
637  __m128i m1CO = _mm_srli_epi32(m1A, 31);
638  __m128i m2CO = _mm_srli_epi32(m2A, 31);
639 
640  co2 = _mm_extract_epi32(m1CO, 3);
641 
642  __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
643  __m128i m2As = _mm_slli_epi32(m2A, 1);
644 
645  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
646  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
647 
648  co1 = co2;
649 
650  co2 = _mm_extract_epi32(m2CO, 3);
651 
652  m2COshft = _mm_slli_si128 (m2CO, 4);
653  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
654 
655  m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
656  m2As = _mm_or_si128(m2As, m2COshft);
657 
658  co1 = co2;
659 
660  // we now have two shifted SSE4 regs with carry-over
661  m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
662  m2A = _mm_xor_si128(m2A, m2As);
663 
664 #ifdef BM64_SSE4
665  bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
666  bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
667  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
668 
669  m0 = _mm_extract_epi64(m2A, 0);
670  m1 = _mm_extract_epi64(m2A, 1);
671  count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
672 #else
673  bm::id_t m0 = _mm_extract_epi32(m1A, 0);
674  bm::id_t m1 = _mm_extract_epi32(m1A, 1);
675  bm::id_t m2 = _mm_extract_epi32(m1A, 2);
676  bm::id_t m3 = _mm_extract_epi32(m1A, 3);
677  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
678  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
679 
680  m0 = _mm_extract_epi32(m2A, 0);
681  m1 = _mm_extract_epi32(m2A, 1);
682  m2 = _mm_extract_epi32(m2A, 2);
683  m3 = _mm_extract_epi32(m2A, 3);
684  count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
685  _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
686 #endif
687 
688  }
689  count -= (w0 & 1u); // correct initial carry-in error
690  return count;
691 }
692 
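/*
 Scalar sketch of the computation above (illustration only; block is the same
 parameter as in sse42_bit_block_calc_change): each word is XOR-ed with its own
 left shift plus the carry-out of the previous word, and the popcount of that
 XOR accumulates the number of bit transitions.

    const bm::word_t* wblk = (const bm::word_t*) block;
    unsigned count = 1, co = 0;
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        bm::word_t w = wblk[i];
        count += unsigned(_mm_popcnt_u32(w ^ ((w << 1) | co)));
        co = w >> 31; // carry-over into the next word
    }
    count -= (wblk[0] & 1u); // correct the initial carry-in error
*/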
693 
694 #ifdef __GNUG__
695 // necessary measure to silence a false GCC warning about negative pointer arithmetic
696 #pragma GCC diagnostic push
697 #pragma GCC diagnostic ignored "-Warray-bounds"
698 #endif
699 
700 /*!
701  SSE4.2 scan of one to two (variable length) 128-bit SSE lines for GAP search results (8 elements each)
702  @ingroup SSE4
703  \internal
704 */
705 inline
706 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
707 {
708  BM_ASSERT(size <= 16);
709  BM_ASSERT(size);
710 
711  const unsigned unroll_factor = 8;
712  if (size < 4) // for very short vector use conventional scan
713  {
714  unsigned j;
715  for (j = 0; j < size; ++j)
716  {
717  if (pbuf[j] >= pos)
718  break;
719  }
720  return j;
721  }
722 
723  __m128i m1, mz, maskF, maskFL;
724 
725  mz = _mm_setzero_si128();
726  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
727 
728  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
729  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
730  int shiftL= (64 - (unroll_factor - size) * 16);
731  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
732 
733  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
734  m1 = _mm_or_si128(m1, maskFL);
735 
736  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
737  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
738  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
739  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
740  if (mi)
741  {
742  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
743  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
744  return unroll_factor - bc; // address of first one element (target)
745  }
746  // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
747  // GCC gives a false warning for "- unroll_factor" here
748  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
749  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
750 
751  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
752  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
753  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
754  unsigned bc = _mm_popcnt_u32(mi);
755 
756  return size - bc;
757 }
758 
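/*
 Scalar equivalent of the search above (illustrative): find the first position j
 with pbuf[j] >= pos. The SIMD path reaches the same answer by forcing the unused
 tail of the 8-element lane to 0xFFFF (maskFL), so those slots always compare as
 >= pos, and then popcounting the comparison mask.

    unsigned j;
    for (j = 0; j < size; ++j)
        if (pbuf[j] >= pos)
            break;
    return j;
*/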
759 /**
760  Experimental (test) function to do SIMD vector search (lower bound)
761  in a sorted, growing array
762  @ingroup SSE4
763 
764  \internal
765 */
766 inline
767 int sse42_cmpge_u32(__m128i vect4, unsigned value)
768 {
769  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
770  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
771  //
772  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
773  __m128i mm_val = _mm_set1_epi32(value);
774 
775  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
776  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
777 
778  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
779  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
780 
781  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
782  int mask = _mm_movemask_epi8(cmp_mask_ge);
783  if (mask)
784  {
785  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
786  return bsf / 4;
787  }
788  return -1;
789 }
790 
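/*
 Worked example of the signed-bias trick used above (illustration only):
 for 32-bit values, a >= b (unsigned) holds exactly when
 (int)(a - 0x80000000) >= (int)(b - 0x80000000), because subtracting the bias
 maps 0..0xFFFFFFFF onto INT_MIN..INT_MAX while preserving order.
 E.g. a = 0xFFFFFFFFu, b = 1u:
    (int)(a - 0x80000000u) == 0x7FFFFFFF          (INT_MAX)
    (int)(b - 0x80000000u) == (int)0x80000001     (a large negative value)
 so the signed comparison agrees with the unsigned one.
*/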
791 
792 /**
793  lower bound (greater or equal) linear scan in an ascending-order sorted array
794  @ingroup SSE4
795  \internal
796 */
797 inline
798 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
799  unsigned target,
800  unsigned from,
801  unsigned to)
802 {
803  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
804  // see more at:
805  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
806 
807  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
808 
809  unsigned unroll_factor = 8;
810  unsigned len = to - from + 1;
811  unsigned len_unr = len - (len % unroll_factor);
812 
813  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
814  __m128i vect_target = _mm_set1_epi32(target);
815  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
816 
817  int mask;
818  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
819 
820  unsigned k = 0;
821  for (; k < len_unr; k+=unroll_factor)
822  {
823  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
824  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
825 
826  cmp_mask_ge = _mm_or_si128( // GT | EQ
827  _mm_cmpgt_epi32 (norm_vect40, norm_target),
828  _mm_cmpeq_epi32 (vect40, vect_target)
829  );
830  mask = _mm_movemask_epi8(cmp_mask_ge);
831  if (mask)
832  {
833  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
834  return from + k + (bsf / 4);
835  }
836  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
837  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
838 
839  cmp_mask_ge = _mm_or_si128(
840  _mm_cmpgt_epi32 (norm_vect41, norm_target),
841  _mm_cmpeq_epi32 (vect41, vect_target)
842  );
843  mask = _mm_movemask_epi8(cmp_mask_ge);
844  if (mask)
845  {
846  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
847  return 4 + from + k + (bsf / 4);
848  }
849  } // for
850 
851  for (; k < len; ++k)
852  {
853  if (arr_base[k] >= target)
854  return from + k;
855  }
856  return to + 1;
857 }
858 
859 
860 
861 /*!
862  SSE4.2 index lookup to check which indices belong to the same block (8 elements at a time)
863  \internal
864 */
865 inline
866 unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
867  unsigned nb, unsigned start)
868 {
869  const unsigned unroll_factor = 8;
870  const unsigned len = (size - start);
871  const unsigned len_unr = len - (len % unroll_factor);
872  unsigned k;
873 
874  idx += start;
875 
876  __m128i nbM = _mm_set1_epi32(nb);
877 
878  for (k = 0; k < len_unr; k+=unroll_factor)
879  {
880  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
881  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
882  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
883  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
884 
885  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
886  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
887  break;
888 
889  } // for k
890  for (; k < len; ++k)
891  {
892  if (nb != unsigned(idx[k] >> bm::set_block_shift))
893  break;
894  }
895  return start + k;
896 }
897 
898 /*!
899  SSE4.2 bulk bit set
900  \internal
901 */
902 inline
903 void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
904  const unsigned* BMRESTRICT idx,
905  unsigned start, unsigned stop )
906 {
907  const unsigned unroll_factor = 4;
908  const unsigned len = (stop - start);
909  const unsigned len_unr = len - (len % unroll_factor);
910 
911  idx += start;
912 
913  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
914  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
915 
916  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
917  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
918 
919  unsigned k = 0;
920  for (; k < len_unr; k+=unroll_factor)
921  {
922  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
923  __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
924  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
925 
926 
927  nbitA = _mm_and_si128 (nbitA, sw_mask);
928  _mm_store_si128 ((__m128i*)mshift_v, nbitA);
929 
930  // check-compare if all 4 bits are in the very same word
931  //
932  __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
933  __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
934  if (_mm_test_all_ones(cmpA)) // check if all are in one word
935  {
936  unsigned nword = _mm_extract_epi32(nwordA, 0);
937  block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
938  |(1u << mshift_v[2]) | (1u << mshift_v[3]);
939  }
940  else // bits are in different words, use scalar scatter
941  {
942  _mm_store_si128 ((__m128i*)mword_v, nwordA);
943 
944  block[mword_v[0]] |= (1u << mshift_v[0]);
945  block[mword_v[1]] |= (1u << mshift_v[1]);
946  block[mword_v[2]] |= (1u << mshift_v[2]);
947  block[mword_v[3]] |= (1u << mshift_v[3]);
948  }
949 
950  } // for k
951 
952  for (; k < len; ++k)
953  {
954  unsigned n = idx[k];
955  unsigned nbit = unsigned(n & bm::set_block_mask);
956  unsigned nword = nbit >> bm::set_word_shift;
957  nbit &= bm::set_word_mask;
958  block[nword] |= (1u << nbit);
959  } // for k
960 }
961 
962 
963 /*!
964  SSE4.2 bit block gather-scatter
965 
966  @param arr - destination array to set bits
967  @param blk - source bit-block
968  @param idx - gather index array
969  @param size - gather array size
970  @param start - gather start index
971  @param bit_idx - bit to set in the target array
972 
973  \internal
974 
975  C algorithm:
976 
977  for (unsigned k = start; k < size; ++k)
978  {
979  nbit = unsigned(idx[k] & bm::set_block_mask);
980  nword = unsigned(nbit >> bm::set_word_shift);
981  mask0 = 1u << (nbit & bm::set_word_mask);
982  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
983  }
984 
985 */
986 inline
987 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
988  const unsigned* BMRESTRICT blk,
989  const unsigned* BMRESTRICT idx,
990  unsigned size,
991  unsigned start,
992  unsigned bit_idx)
993 {
994  const unsigned unroll_factor = 4;
995  const unsigned len = (size - start);
996  const unsigned len_unr = len - (len % unroll_factor);
997 
998  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
999  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1000  __m128i maskFF = _mm_set1_epi32(~0u);
1001  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1002 
1003  __m128i mask_tmp, mask_0;
1004 
1005  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1006  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1007 
1008  unsigned k = 0;
1009  unsigned base = start + k;
1010  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1011  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1012  for (; k < len_unr; k+=unroll_factor)
1013  {
1014  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1015  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1016  // (nbit & bm::set_word_mask)
1017  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1018  _mm_store_si128 ((__m128i*)mword_v, nwordA);
1019 
1020  // mask0 = 1u << (nbit & bm::set_word_mask);
1021  //
1022 #if 0
1023  // ifdefed an alternative SHIFT implementation using SSE and masks
1024  // (it is not faster than just doing scalar operations)
1025  {
1026  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1027  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1028  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1029  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1030  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1031 
1032  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1033  _mm_slli_si128 (am_0, 8));
1034  mask_tmp = _mm_and_si128 (
1035  _mm_slli_epi32(mask1, mshift_v[3]),
1036  _mm_slli_si128 (am_0, 12)
1037  );
1038 
1039  mask_0 = _mm_or_si128 (mask_0,
1040  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1041  }
1042 #endif
1043  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1044 
1045 
1046  // gather for: blk[nword] (.. & mask0 )
1047  //
1048  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1049  blk[mword_v[1]], blk[mword_v[0]]),
1050  mask_0);
1051 
1052  // bool(blk[nword] ...)
1053  //maskFF = _mm_set1_epi32(~0u);
1054  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1055  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1056  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1057 
1058  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1059 
1060  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1061  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1062 
1063  ++idx_ptr; ++target_ptr;
1064  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1065  }
1066 
1067  for (; k < len; ++k)
1068  {
1069  base = start + k;
1070  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1071  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1072  }
1073 
1074 }
1075 
1076 /*!
1077  @brief block shift left by 1
1078  @ingroup SSE4
1079 */
1080 inline
1081 bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1)
1082 {
1083  __m128i* block_end =
1084  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1085  __m128i mAcc = _mm_set1_epi32(0);
1086  __m128i mMask1 = _mm_set1_epi32(1);
1087 
1088  unsigned co2;
1089  for (--block_end; block_end >= block; block_end -= 2)
1090  {
1091  __m128i m1A = _mm_load_si128(block_end);
1092  __m128i m2A = _mm_load_si128(block_end-1);
1093 
1094  __m128i m1CO = _mm_and_si128(m1A, mMask1);
1095  __m128i m2CO = _mm_and_si128(m2A, mMask1);
1096 
1097  co2 = _mm_extract_epi32(m1CO, 0);
1098 
1099  m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1100  m2A = _mm_srli_epi32(m2A, 1);
1101 
1102  __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1103  __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1104  m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1105  m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1106  m1COshft = _mm_slli_epi32(m1COshft, 31);
1107  m2COshft = _mm_slli_epi32(m2COshft, 31);
1108 
1109  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1110  m2A = _mm_or_si128(m2A, m2COshft);
1111 
1112  co1 = _mm_extract_epi32(m2CO, 0);
1113 
1114  _mm_store_si128(block_end, m1A);
1115  _mm_store_si128(block_end-1, m2A);
1116 
1117  mAcc = _mm_or_si128(mAcc, m1A);
1118  mAcc = _mm_or_si128(mAcc, m2A);
1119  } // for
1120 
1121  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1122  return co1;
1123 }
1124 
1125 
1126 /*!
1127  @brief block shift right by 1
1128  @ingroup SSE4
1129 */
1130 inline
1131 bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
1132 {
1133  __m128i* block_end =
1134  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1135  __m128i m1COshft, m2COshft;
1136  __m128i mAcc = _mm_set1_epi32(0);
1137 
1138  unsigned co2;
1139  for (;block < block_end; block += 2)
1140  {
1141  __m128i m1A = _mm_load_si128(block);
1142  __m128i m2A = _mm_load_si128(block+1);
1143 
1144  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1145  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1146 
1147  co2 = _mm_extract_epi32(m1CO, 3);
1148 
1149  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1150  m2A = _mm_slli_epi32(m2A, 1);
1151 
1152  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1153  m2COshft = _mm_slli_si128 (m2CO, 4);
1154  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1155  m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1156 
1157  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1158  m2A = _mm_or_si128(m2A, m2COshft);
1159 
1160  co1 = _mm_extract_epi32(m2CO, 3);
1161 
1162  _mm_store_si128(block, m1A);
1163  _mm_store_si128(block+1, m2A);
1164 
1165  mAcc = _mm_or_si128(mAcc, m1A);
1166  mAcc = _mm_or_si128(mAcc, m2A);
1167  }
1168  *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1169  return co1;
1170 }
1171 
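/*
 Scalar sketch of the shift above (illustration only; block, co1 and empty_acc
 have the same meaning as in sse42_shift_r1): each word is shifted left by one
 bit and receives the previous word's carry-out as its new bit 0; the final
 carry is returned.

    bm::word_t* wblk = (bm::word_t*) block;
    unsigned acc = 0;
    for (unsigned i = 0; i < bm::set_block_size; ++i)
    {
        unsigned co2 = wblk[i] >> 31;          // carry-out of this word
        wblk[i] = (wblk[i] << 1) | co1;        // shift in the previous carry
        acc |= wblk[i];
        co1 = co2;
    }
    *empty_acc = (acc != 0);
    return co1;
*/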
1172 
1173 
1174 /*!
1175  @brief block shift right by 1 plus AND
1176 
1177  @return carry over flag
1178  @ingroup SSE4
1179 */
1180 inline
1181 bool sse42_shift_r1_and(__m128i* block,
1182  bm::word_t co1,
1183  const __m128i* BMRESTRICT mask_block,
1184  bm::id64_t* digest)
1185 {
1186  bm::word_t* wblock = (bm::word_t*) block;
1187  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1188 
1189  __m128i m1COshft, m2COshft;
1190  __m128i mAcc = _mm_set1_epi32(0);
1191  unsigned co2;
1192 
1193  bm::id64_t d, wd;
1194  wd = d = *digest;
1195 
1196  unsigned di = 0;
1197  if (!co1)
1198  {
1199  bm::id64_t t = d & -d;
1200 #ifdef BM64_SSE4
1201  di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1202 #else
1203  bm::id_t t32 = t & bm::id_max;
1204  if (t32 == 0) {
1205  di = 32;
1206  t32 = t >> 32;
1207  }
1208  di += unsigned(_mm_popcnt_u32(t32 - 1));
1209 #endif
1210  }
1211 
1212  for (; di < 64 ; ++di)
1213  {
1214  const unsigned d_base = di * bm::set_block_digest_wave_size;
1215  bm::id64_t dmask = (1ull << di);
1216  if (d & dmask) // digest stride NOT empty
1217  {
1218  block = (__m128i*) &wblock[d_base];
1219  mask_block = (__m128i*) &mblock[d_base];
1220  mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1221  for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1222  {
1223  __m128i m1A = _mm_load_si128(block);
1224  __m128i m2A = _mm_load_si128(block+1);
1225 
1226  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1227  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1228 
1229  co2 = _mm_extract_epi32(m1CO, 3);
1230 
1231  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1232  m2A = _mm_slli_epi32(m2A, 1);
1233 
1234  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1235  m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1236 
1237  co1 = co2;
1238 
1239  co2 = _mm_extract_epi32(m2CO, 3);
1240 
1241  m2COshft = _mm_slli_si128 (m2CO, 4);
1242  m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1243 
1244  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1245  m2A = _mm_or_si128(m2A, m2COshft);
1246 
1247  m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1248  m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1249 
1250  mAcc = _mm_or_si128(mAcc, m1A);
1251  mAcc = _mm_or_si128(mAcc, m2A);
1252 
1253  _mm_store_si128(block, m1A);
1254  _mm_store_si128(block+1, m2A);
1255 
1256  co1 = co2;
1257 
1258  } // for i
1259 
1260  if (_mm_testz_si128(mAcc, mAcc))
1261  d &= ~dmask; // clear digest bit
1262  wd &= wd - 1;
1263  }
1264  else
1265  {
1266  if (co1)
1267  {
1268  BM_ASSERT(co1 == 1);
1269  BM_ASSERT(wblock[d_base] == 0);
1270 
1271  bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1272  d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1273  co1 = 0;
1274  }
1275  if (!wd) // digest is empty, no CO -> exit
1276  break;
1277  }
1278  } // for di
1279 
1280  *digest = d;
1281  return co1;
1282 }
1283 
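/*
 Digest layout note (illustrative, using the loop variables above): bit di of the
 64-bit digest marks one stride of bm::set_block_digest_wave_size words inside the
 block; a zero bit lets the loop skip that stride entirely.

    unsigned d_base = di * bm::set_block_digest_wave_size; // first word of stride di
    bool stride_may_have_bits = (*digest >> di) & 1;
*/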
1284 
1285 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1286  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1287 
1288 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1289  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1290 
1291 #define VECT_BITCOUNT(first, last) \
1292  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1293 
1294 #define VECT_BITCOUNT_AND(first, last, mask) \
1295  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1296 
1297 #define VECT_BITCOUNT_OR(first, last, mask) \
1298  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1299 
1300 #define VECT_BITCOUNT_XOR(first, last, mask) \
1301  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1302 
1303 #define VECT_BITCOUNT_SUB(first, last, mask) \
1304  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1305 
1306 #define VECT_INVERT_BLOCK(first) \
1307  sse2_invert_block((__m128i*)first);
1308 
1309 #define VECT_AND_BLOCK(dst, src) \
1310  sse4_and_block((__m128i*) dst, (__m128i*) (src))
1311 
1312 #define VECT_AND_DIGEST(dst, src) \
1313  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1314 
1315 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1316  sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1317 
1318 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1319  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1320 
1321 #define VECT_OR_BLOCK(dst, src) \
1322  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1323 
1324 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1325  sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1326 
1327 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1328  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1329 
1330 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1331  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1332 
1333 #define VECT_SUB_BLOCK(dst, src) \
1334  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1335 
1336 #define VECT_SUB_DIGEST(dst, src) \
1337  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1338 
1339 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1340  sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1341 
1342 #define VECT_XOR_BLOCK(dst, src) \
1343  sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1344 
1345 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1346  sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1347 
1348 #define VECT_COPY_BLOCK(dst, src) \
1349  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1350 
1351 #define VECT_STREAM_BLOCK(dst, src) \
1352  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1353 
1354 #define VECT_SET_BLOCK(dst, value) \
1355  sse2_set_block((__m128i*) dst, value)
1356 
1357 #define VECT_IS_ZERO_BLOCK(dst) \
1358  sse4_is_all_zero((__m128i*) dst)
1359 
1360 #define VECT_IS_ONE_BLOCK(dst) \
1361  sse4_is_all_one((__m128i*) dst)
1362 
1363 #define VECT_IS_DIGEST_ZERO(start) \
1364  sse4_is_digest_zero((__m128i*)start)
1365 
1366 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1367  sse4_block_set_digest((__m128i*)dst, val)
1368 
1369 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1370  sse4_lower_bound_scan_u32(arr, target, from, to)
1371 
1372 #define VECT_SHIFT_L1(b, acc, co) \
1373  sse42_shift_l1((__m128i*)b, acc, co)
1374 
1375 #define VECT_SHIFT_R1(b, acc, co) \
1376  sse42_shift_r1((__m128i*)b, acc, co)
1377 
1378 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
1379  sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1380 
1381 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1382  sse42_idx_arr_block_lookup(idx, size, nb, start)
1383 
1384 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1385  sse42_set_block_bits(block, idx, start, stop)
1386 
1387 #define VECT_BLOCK_CHANGE(block) \
1388  sse42_bit_block_calc_change((__m128i*)block)
1389 
1390 
1391 #ifdef __GNUG__
1392 #pragma GCC diagnostic pop
1393 #endif
1394 
1395 #ifdef _MSC_VER
1396 #pragma warning( pop )
1397 #endif
1398 
1399 } // namespace
1400 
1401 
1402 
1403 
1404 #endif