BitMagic-C++
bmsse4.h
1 #ifndef BMSSE4__H__INCLUDED__
2 #define BMSSE4__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse4.h
22  \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23 */
24 
25 #include<mmintrin.h>
26 #include<emmintrin.h>
27 #include<smmintrin.h>
28 #include<nmmintrin.h>
29 #include<immintrin.h>
30 
31 #include "bmdef.h"
32 #include "bmsse_util.h"
33 #include "bmutil.h"
34 
35 namespace bm
36 {
37 
38 /** @defgroup SSE4 SSE4.2 functions (internal)
39  Processor specific optimizations for SSE4.2 instructions (internals)
40  @internal
41  @ingroup bvector
42  */
43 
44 #ifdef __GNUG__
45 #pragma GCC diagnostic push
46 #pragma GCC diagnostic ignored "-Wconversion"
47 #endif
48 
49 
50 
51 /*!
52  SSE4.2 optimized bit counting.
53  @ingroup SSE4
54 */
55 inline
56 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
57 {
58  bm::id_t count = 0;
59 #ifdef BM64_SSE4
60  const bm::id64_t* b = (bm::id64_t*) block;
61  const bm::id64_t* b_end = (bm::id64_t*) block_end;
62  do
63  {
64  count += unsigned( _mm_popcnt_u64(b[0]) +
65  _mm_popcnt_u64(b[1]));
66  b += 2;
67  } while (b < b_end);
68 #else
69  do
70  {
71  const unsigned* b = (unsigned*) block;
72  count += _mm_popcnt_u32(b[0]) +
73  _mm_popcnt_u32(b[1]) +
74  _mm_popcnt_u32(b[2]) +
75  _mm_popcnt_u32(b[3]);
76  } while (++block < block_end);
77 #endif
78  return count;
79 }
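// Editor's illustration (not part of the original header): a minimal sketch of how
// the block popcount above might be driven over one full bit-block. The wrapper
// name is hypothetical; it only assumes blk points at bm::set_block_size words.
inline
bm::id_t sse4_bit_count_block_sketch(const bm::word_t* BMRESTRICT blk)
{
    const __m128i* s = (const __m128i*) blk;
    const __m128i* s_end = (const __m128i*) (blk + bm::set_block_size);
    return bm::sse4_bit_count(s, s_end); // total number of set bits in the block
}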
80 
81 /*!
82 \internal
83 */
84 BMFORCEINLINE
85 unsigned op_xor(unsigned a, unsigned b)
86 {
87  unsigned ret = (a ^ b);
88  return ret;
89 }
90 
91 /*!
92 \internal
93 */
94 BMFORCEINLINE
95 unsigned op_or(unsigned a, unsigned b)
96 {
97  return (a | b);
98 }
99 
100 /*!
101 \internal
102 */
103 BMFORCEINLINE
104 unsigned op_and(unsigned a, unsigned b)
105 {
106  return (a & b);
107 }
108 
109 
110 template<class Func>
111 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
112  const __m128i* BMRESTRICT block_end,
113  const __m128i* BMRESTRICT mask_block,
114  Func sse2_func)
115 {
116  bm::id_t count = 0;
117 #ifdef BM64_SSE4
118  do
119  {
120  __m128i tmp0 = _mm_load_si128(block);
121  __m128i tmp1 = _mm_load_si128(mask_block);
122  __m128i b = sse2_func(tmp0, tmp1);
123 
124  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
125  count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
126 
127  ++block; ++mask_block;
128  } while (block < block_end);
129 #else
130  do
131  {
132  __m128i tmp0 = _mm_load_si128(block);
133  __m128i tmp1 = _mm_load_si128(mask_block);
134  __m128i b = sse2_func(tmp0, tmp1);
135 
136  count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
137  count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
138  count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
139  count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
140 
141  ++block; ++mask_block;
142  } while (block < block_end);
143 #endif
144 
145  return count;
146 }
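// Editor's illustration (hypothetical wrapper, assuming the sse2_and helper from
// bmsse_util.h, as used by the VECT_BITCOUNT_AND macro below): counts the population
// of the intersection of two 16-byte aligned bit-blocks without materializing the result.
inline
bm::id_t sse4_bit_count_and_sketch(const bm::word_t* BMRESTRICT blk,
                                   const bm::word_t* BMRESTRICT mask_blk)
{
    return bm::sse4_bit_count_op((const __m128i*) blk,
                                 (const __m128i*) (blk + bm::set_block_size),
                                 (const __m128i*) mask_blk,
                                 sse2_and);
}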
147 
148 /*!
149  @brief check if block is all zero bits
150  @ingroup SSE4
151 */
152 inline
153 bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
154 {
155  __m128i w;
156  __m128i maskz = _mm_setzero_si128();
157  const __m128i* BMRESTRICT block_end =
158  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
159 
160  do
161  {
162  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
163  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
164  return false;
165  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
166  if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w2 | w3) != maskz
167  return false;
168  block += 4;
169  } while (block < block_end);
170  return true;
171 }
172 
173 /*!
174  @brief check if digest stride is all zero bits
175  @ingroup SSE4
176 */
177 inline
178 bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
179 {
180  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
181  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
182  wA = _mm_or_si128(wA, wB);
183  bool z1 = _mm_test_all_zeros(wA, wA);
184 
185  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
186  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
187  wA = _mm_or_si128(wA, wB);
188  bool z2 = _mm_test_all_zeros(wA, wA);
189  return z1 & z2;
190 }
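// Editor's note (layout assumption derived from the code above): one digest stride
// covers 8 x 128-bit loads, i.e. 32 words of the bit-block. A hypothetical sketch of
// walking a whole aligned block stride by stride with the check above:
inline
bool sse4_all_strides_zero_sketch(const bm::word_t* BMRESTRICT blk)
{
    const unsigned stride_words = 32; // 8 loads * 128 bits / 32-bit words
    for (unsigned i = 0; i < bm::set_block_size; i += stride_words)
        if (!bm::sse4_is_digest_zero((const __m128i*)(blk + i)))
            return false; // this stride still has bits set
    return true;
}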
191 
192 
193 /*!
194  @brief AND blocks
195  *dst &= *src
196 
197  @return 0 if no bits were set
198 
199  @ingroup SSE4
200 */
201 inline
202 unsigned sse4_and_block(__m128i* BMRESTRICT dst,
203  const __m128i* BMRESTRICT src)
204 {
205  __m128i m1A, m1B, m1C, m1D;
206  __m128i accA, accB, accC, accD;
207 
208  const __m128i* BMRESTRICT src_end =
209  (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
210 
211  accA = accB = accC = accD = _mm_setzero_si128();
212 
213  do
214  {
215  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
216  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
217  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
218  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
219 
220  _mm_store_si128(dst+0, m1A);
221  _mm_store_si128(dst+1, m1B);
222  _mm_store_si128(dst+2, m1C);
223  _mm_store_si128(dst+3, m1D);
224 
225  accA = _mm_or_si128(accA, m1A);
226  accB = _mm_or_si128(accB, m1B);
227  accC = _mm_or_si128(accC, m1C);
228  accD = _mm_or_si128(accD, m1D);
229 
230  src += 4; dst += 4;
231  } while (src < src_end);
232 
233  accA = _mm_or_si128(accA, accB); // A = A | B
234  accC = _mm_or_si128(accC, accD); // C = C | D
235  accA = _mm_or_si128(accA, accC); // A = A | C
236 
237  return !_mm_testz_si128(accA, accA);
238 }
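// Editor's illustration (hypothetical wrapper): in-place AND of two 16-byte aligned
// bit-blocks; the accumulator-based return value above tells the caller whether the
// destination block still carries any bits (0 means it can be dropped as all-zero).
inline
bool sse4_and_block_any_sketch(bm::word_t* BMRESTRICT dst_blk,
                               const bm::word_t* BMRESTRICT src_blk)
{
    unsigned any = bm::sse4_and_block((__m128i*) dst_blk, (const __m128i*) src_blk);
    return any != 0;
}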
239 
240 
241 /*!
242  @brief AND block digest stride
243  *dst &= *src
244 
245  @return true if stride is all zero
246  @ingroup SSE4
247 */
248 inline
249 bool sse4_and_digest(__m128i* BMRESTRICT dst,
250  const __m128i* BMRESTRICT src)
251 {
252  __m128i m1A, m1B, m1C, m1D;
253 
254  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
255  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
256  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
257  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
258 
259  _mm_store_si128(dst+0, m1A);
260  _mm_store_si128(dst+1, m1B);
261  _mm_store_si128(dst+2, m1C);
262  _mm_store_si128(dst+3, m1D);
263 
264  m1A = _mm_or_si128(m1A, m1B);
265  m1C = _mm_or_si128(m1C, m1D);
266  m1A = _mm_or_si128(m1A, m1C);
267 
268  bool z1 = _mm_testz_si128(m1A, m1A);
269 
270  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
271  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
272  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
273  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
274 
275  _mm_store_si128(dst+4, m1A);
276  _mm_store_si128(dst+5, m1B);
277  _mm_store_si128(dst+6, m1C);
278  _mm_store_si128(dst+7, m1D);
279 
280  m1A = _mm_or_si128(m1A, m1B);
281  m1C = _mm_or_si128(m1C, m1D);
282  m1A = _mm_or_si128(m1A, m1C);
283 
284  bool z2 = _mm_testz_si128(m1A, m1A);
285 
286  return z1 & z2;
287 }
288 
289 /*!
290  @brief AND block digest stride
291  *dst = *src1 & src2
292 
293  @return true if stride is all zero
294  @ingroup SSE4
295 */
296 inline
297 bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
298  const __m128i* BMRESTRICT src1,
299  const __m128i* BMRESTRICT src2)
300 {
301  __m128i m1A, m1B, m1C, m1D;
302 
303  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
304  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
305  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
306  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
307 
308  _mm_store_si128(dst+0, m1A);
309  _mm_store_si128(dst+1, m1B);
310  _mm_store_si128(dst+2, m1C);
311  _mm_store_si128(dst+3, m1D);
312 
313  m1A = _mm_or_si128(m1A, m1B);
314  m1C = _mm_or_si128(m1C, m1D);
315  m1A = _mm_or_si128(m1A, m1C);
316 
317  bool z1 = _mm_testz_si128(m1A, m1A);
318 
319  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
320  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
321  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
322  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
323 
324  _mm_store_si128(dst+4, m1A);
325  _mm_store_si128(dst+5, m1B);
326  _mm_store_si128(dst+6, m1C);
327  _mm_store_si128(dst+7, m1D);
328 
329  m1A = _mm_or_si128(m1A, m1B);
330  m1C = _mm_or_si128(m1C, m1D);
331  m1A = _mm_or_si128(m1A, m1C);
332 
333  bool z2 = _mm_testz_si128(m1A, m1A);
334 
335  return z1 & z2;
336 }
337 
338 /*!
339  @brief SUB (AND NOT) block digest stride
340  *dst &= ~*src
341 
342  @return true if stride is all zero
343  @ingroup SSE4
344 */
345 inline
346 bool sse4_sub_digest(__m128i* BMRESTRICT dst,
347  const __m128i* BMRESTRICT src)
348 {
349  __m128i m1A, m1B, m1C, m1D;
350 
351  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
352  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
353  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
354  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
355 
356  _mm_store_si128(dst+0, m1A);
357  _mm_store_si128(dst+1, m1B);
358  _mm_store_si128(dst+2, m1C);
359  _mm_store_si128(dst+3, m1D);
360 
361  m1A = _mm_or_si128(m1A, m1B);
362  m1C = _mm_or_si128(m1C, m1D);
363  m1A = _mm_or_si128(m1A, m1C);
364 
365  bool z1 = _mm_testz_si128(m1A, m1A);
366 
367  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
368  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
369  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
370  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
371 
372  _mm_store_si128(dst+4, m1A);
373  _mm_store_si128(dst+5, m1B);
374  _mm_store_si128(dst+6, m1C);
375  _mm_store_si128(dst+7, m1D);
376 
377  m1A = _mm_or_si128(m1A, m1B);
378  m1C = _mm_or_si128(m1C, m1D);
379  m1A = _mm_or_si128(m1A, m1C);
380 
381  bool z2 = _mm_testz_si128(m1A, m1A);
382 
383  return z1 & z2;
384 }
385 
386 
387 
388 /*!
389  @brief check if block is all one bits
390  @ingroup SSE4
391 */
392 inline
393 bool sse4_is_all_one(const __m128i* BMRESTRICT block)
394 {
395  __m128i w;
396  const __m128i* BMRESTRICT block_end =
397  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
398 
399  do
400  {
401  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
402  if (!_mm_test_all_ones(w))
403  return false;
404  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
405  if (!_mm_test_all_ones(w))
406  return false;
407 
408  block+=4;
409  } while (block < block_end);
410  return true;
411 }
412 
413 /*!
414  @brief check if wave of pointers is all NULL
415  @ingroup SSE4
416 */
417 BMFORCEINLINE
418 bool sse42_test_all_zero_wave(const void* ptr)
419 {
420  __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
421  return _mm_testz_si128(w0, w0);
422 }
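// Editor's illustration (hypothetical helper): tests a pair of sub-block pointers
// for NULL in a single 128-bit load. The array is padded with zeros so the load
// stays in bounds whether pointers are 32-bit or 64-bit wide.
inline
bool sse42_wave_is_null_sketch(const void* p0, const void* p1)
{
    const void* wave[4] = { p0, p1, 0, 0 };
    return bm::sse42_test_all_zero_wave(wave); // true when both pointers are NULL
}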
423 
424 
425 
426 /*!
427  SSE4.2 optimized bit counting and calculation of the number of GAPs
428  @ingroup SSE4
429 */
430 inline
431 bm::id_t sse4_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
432  const __m128i* BMRESTRICT block_end,
433  unsigned* BMRESTRICT bit_count)
434 {
435  int count = (unsigned)(block_end - block)*4;
436 
437  bm::word_t w0, w_prev;
438  const int w_shift = sizeof(w0) * 8 - 1;
439  bool first_word = true;
440  *bit_count = 0;
441 
442  // first word
443  {
444  bm::word_t w;
445  const bm::word_t* blk = (const bm::word_t*) block;
446  w = w0 = blk[0];
447  *bit_count += _mm_popcnt_u32(w);
448  w ^= (w >> 1);
449  count += _mm_popcnt_u32(w);
450  count -= (w_prev = (w0 >> w_shift));
451  }
452 
453  do
454  {
455  __m128i b = _mm_load_si128(block);
456  __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); // tmp2=(b >> 1) ^ b;
457  __m128i tmp3 = _mm_srli_epi32(b, w_shift); // tmp3 = w0 >> w_shift
458 // __m128i tmp4 = _mm_and_si128(b, mask1); // tmp4 = w0 & 1
459 
460  // ---------------------------------------------------------------------
461  {
462  if (first_word)
463  {
464  first_word = false;
465  }
466  else
467  {
468  w0 = _mm_extract_epi32(b, 0);
469  if (w0)
470  {
471  *bit_count += _mm_popcnt_u32(w0);
472  count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
473  count -= !(w_prev ^ (w0 & 1));
474  count -= w_prev = _mm_extract_epi32(tmp3, 0);
475  }
476  else
477  {
478  count -= !w_prev; w_prev ^= w_prev;
479  }
480  }
481  w0 = _mm_extract_epi32(b, 1);
482  if (w0)
483  {
484  *bit_count += _mm_popcnt_u32(w0);
485  count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
486  count -= !(w_prev ^ (w0 & 1));
487  count -= w_prev = _mm_extract_epi32(tmp3, 1);
488  }
489  else
490  {
491  count -= !w_prev; w_prev ^= w_prev;
492  }
493  w0 = _mm_extract_epi32(b, 2);
494  if (w0)
495  {
496  *bit_count += _mm_popcnt_u32(w0);
497  count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
498  count -= !(w_prev ^ (w0 & 1));
499  count -= w_prev = _mm_extract_epi32(tmp3, 2);
500  }
501  else
502  {
503  count -= !w_prev; w_prev ^= w_prev;
504  }
505  w0 = _mm_extract_epi32(b, 3);
506  if (w0)
507  {
508  *bit_count += _mm_popcnt_u32(w0);
509  count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
510  count -= !(w_prev ^ (w0 & 1));
511  count -= w_prev = _mm_extract_epi32(tmp3, 3);
512  }
513  else
514  {
515  count -= !w_prev; w_prev ^= w_prev;
516  }
517  }
518  } while (++block < block_end);
519 
520  return count;
521 }
522 
523 
524 
525 #ifdef __GNUG__
526 // necessary measure to silence a false GCC warning about negative pointer arithmetic
527 #pragma GCC diagnostic push
528 #pragma GCC diagnostic ignored "-Warray-bounds"
529 #endif
530 
531 /*!
532  SSE4.2 scan of one to two (variable length) 128-bit SSE lanes for GAP search results (8 elements per lane)
533  @ingroup SSE4
534  \internal
535 */
536 inline
537 unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
538 {
539  BM_ASSERT(size <= 16);
540  BM_ASSERT(size);
541 
542  const unsigned unroll_factor = 8;
543  if (size < 4) // for very short vector use conventional scan
544  {
545  unsigned j;
546  for (j = 0; j < size; ++j)
547  {
548  if (pbuf[j] >= pos)
549  break;
550  }
551  return j;
552  }
553 
554  __m128i m1, mz, maskF, maskFL;
555 
556  mz = _mm_setzero_si128();
557  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
558 
559  maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
560  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
561  int shiftL= (64 - (unroll_factor - size) * 16);
562  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
563 
564  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
565  m1 = _mm_or_si128(m1, maskFL);
566 
567  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
568  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
569  __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
570  int mi = _mm_movemask_epi8(c_mask); // collect flag bits
571  if (mi)
572  {
573  // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
574  unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
575  return unroll_factor - bc; // address of first one element (target)
576  }
577  // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
578  // GCC gives a false warning for "- unroll_factor" here
579  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
580  BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
581 
582  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
583  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
584  mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
585  unsigned bc = _mm_popcnt_u32(mi);
586 
587  return size - bc;
588 }
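// Editor's worked example (hypothetical helper): for buf = {2,4,6,8,10,12,14,16}
// and pos = 7 the first element >= pos is buf[3] == 8, so sse4_gap_find is
// expected to return 3 (the lower-bound position within the inspected elements).
inline
unsigned sse4_gap_find_example()
{
    static const bm::gap_word_t buf[8] = { 2, 4, 6, 8, 10, 12, 14, 16 };
    return bm::sse4_gap_find(buf, (bm::gap_word_t)7, 8u);
}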
589 
590 /**
591  Experimental (test) function to do SIMD vector search (lower bound)
592  in a sorted, growing array
593  @ingroup SSE4
594 
595  \internal
596 */
597 inline
598 int sse42_cmpge_u32(__m128i vect4, unsigned value)
599 {
600  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
601  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
602  //
603  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
604  __m128i mm_val = _mm_set1_epi32(value);
605 
606  __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
607  __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
608 
609  __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
610  __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
611 
612  __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
613  int mask = _mm_movemask_epi8(cmp_mask_ge);
614  if (mask)
615  {
616  int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
617  return bsf / 4;
618  }
619  return -1;
620 }
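// Editor's worked example of the bias trick above (hypothetical helper): with
// lanes {10, 20, 30, 40} and value 25, the first lane >= 25 is lane 2 (value 30),
// so the function is expected to return 2; it returns -1 when no lane qualifies.
inline
int sse42_cmpge_u32_example()
{
    __m128i v = _mm_set_epi32(40, 30, 20, 10); // lane 0 = 10 ... lane 3 = 40
    return bm::sse42_cmpge_u32(v, 25u);
}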
621 
622 
623 /**
624  lower bound (greater or equal) linear scan in an ascending-order sorted array
625  @ingroup SSE4
626  \internal
627 */
628 inline
629 unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
630  unsigned target,
631  unsigned from,
632  unsigned to)
633 {
634  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
635  // see more at:
636  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
637 
638  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
639 
640  unsigned unroll_factor = 8;
641  unsigned len = to - from + 1;
642  unsigned len_unr = len - (len % unroll_factor);
643 
644  __m128i mask0x8 = _mm_set1_epi32(0x80000000);
645  __m128i vect_target = _mm_set1_epi32(target);
646  __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
647 
648  int mask;
649  __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
650 
651  unsigned k = 0;
652  for (; k < len_unr; k+=unroll_factor)
653  {
654  vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
655  norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
656 
657  cmp_mask_ge = _mm_or_si128( // GT | EQ
658  _mm_cmpgt_epi32 (norm_vect40, norm_target),
659  _mm_cmpeq_epi32 (vect40, vect_target)
660  );
661  mask = _mm_movemask_epi8(cmp_mask_ge);
662  if (mask)
663  {
664  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
665  return from + k + (bsf / 4);
666  }
667  vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
668  norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
669 
670  cmp_mask_ge = _mm_or_si128(
671  _mm_cmpgt_epi32 (norm_vect41, norm_target),
672  _mm_cmpeq_epi32 (vect41, vect_target)
673  );
674  mask = _mm_movemask_epi8(cmp_mask_ge);
675  if (mask)
676  {
677  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
678  return 4 + from + k + (bsf / 4);
679  }
680  } // for
681 
682  for (; k < len; ++k)
683  {
684  if (arr_base[k] >= target)
685  return from + k;
686  }
687  return to + 1;
688 }
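// Editor's worked example (hypothetical helper): in the sorted array below the
// first element >= 30 is arr[4] == 33, so the scan is expected to return 4;
// when no element qualifies the function returns to + 1.
inline
unsigned sse4_lower_bound_scan_u32_example()
{
    static const unsigned arr[8] = { 1, 5, 9, 17, 33, 65, 129, 257 };
    return bm::sse4_lower_bound_scan_u32(arr, 30u, 0u, 7u);
}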
689 
690 
691 
692 /*!
693  SSE4.2 index lookup to check which elements belong to the same block (8 elements at a time)
694  \internal
695 */
696 inline
697 unsigned sse4_idx_arr_block_lookup(const unsigned* idx, unsigned size,
698  unsigned nb, unsigned start)
699 {
700  const unsigned unroll_factor = 8;
701  const unsigned len = (size - start);
702  const unsigned len_unr = len - (len % unroll_factor);
703  unsigned k;
704 
705  idx += start;
706 
707  __m128i nbM = _mm_set1_epi32(nb);
708 
709  for (k = 0; k < len_unr; k+=unroll_factor)
710  {
711  __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
712  __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
713  __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
714  __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
715 
716  if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
717  !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
718  break;
719 
720  } // for k
721  for (; k < len; ++k)
722  {
723  if (nb != unsigned(idx[k] >> bm::set_block_shift))
724  break;
725  }
726  return start + k;
727 }
728 
729 /*!
730  SSE4.2 bit block gather-scatter
731 
732  @param arr - destination array to set bits
733  @param blk - source bit-block
734  @param idx - gather index array
735  @param size - gather array size
736  @param start - gather start index
737  @param bit_idx - bit to set in the target array
738 
739  \internal
740 
741  C algorithm:
742 
743  for (unsigned k = start; k < size; ++k)
744  {
745  nbit = unsigned(idx[k] & bm::set_block_mask);
746  nword = unsigned(nbit >> bm::set_word_shift);
747  mask0 = 1u << (nbit & bm::set_word_mask);
748  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
749  }
750 
751 */
752 inline
753 void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
754  const unsigned* BMRESTRICT blk,
755  const unsigned* BMRESTRICT idx,
756  unsigned size,
757  unsigned start,
758  unsigned bit_idx)
759 {
760  const unsigned unroll_factor = 4;
761  const unsigned len = (size - start);
762  const unsigned len_unr = len - (len % unroll_factor);
763 
764  __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
765  __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
766  __m128i maskFF = _mm_set1_epi32(~0u);
767  __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
768 
769  __m128i mask_tmp, mask_0;
770 
771  unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
772  unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
773 
774  unsigned k = 0;
775  unsigned base = start + k;
776  __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
777  __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
778  for (; k < len_unr; k+=unroll_factor)
779  {
780  __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
781  __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
782  // (nbit & bm::set_word_mask)
783  _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
784  _mm_store_si128 ((__m128i*)mword_v, nwordA);
785 
786  // mask0 = 1u << (nbit & bm::set_word_mask);
787  //
788 #if 0
789  // ifdefed an alternative SHIFT implementation using SSE and masks
790  // (it is not faster than just doing scalar operations)
791  {
792  __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
793  __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
794  mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
795  mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
796  mask_0 = _mm_or_si128 (mask_0, mask_tmp);
797 
798  __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
799  _mm_slli_si128 (am_0, 8));
800  mask_tmp = _mm_and_si128 (
801  _mm_slli_epi32(mask1, mshift_v[3]),
802  _mm_slli_si128 (am_0, 12)
803  );
804 
805  mask_0 = _mm_or_si128 (mask_0,
806  _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
807  }
808 #endif
809  mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
810 
811 
812  // gather for: blk[nword] (.. & mask0 )
813  //
814  mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
815  blk[mword_v[1]], blk[mword_v[0]]),
816  mask_0);
817 
818  // bool(blk[nword] ...)
819  //maskFF = _mm_set1_epi32(~0u);
820  mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
821  mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
822  mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
823 
824  mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
825 
826  _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
827  _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
828 
829  ++idx_ptr; ++target_ptr;
830  _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
831  }
832 
833  for (; k < len; ++k)
834  {
835  base = start + k;
836  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
837  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
838  }
839 
840 }
841 
842 
843 
844 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
845  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
846 
847 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
848  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
849 
850 #define VECT_BITCOUNT(first, last) \
851  sse4_bit_count((__m128i*) (first), (__m128i*) (last))
852 
853 #define VECT_BITCOUNT_AND(first, last, mask) \
854  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
855 
856 #define VECT_BITCOUNT_OR(first, last, mask) \
857  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
858 
859 #define VECT_BITCOUNT_XOR(first, last, mask) \
860  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
861 
862 #define VECT_BITCOUNT_SUB(first, last, mask) \
863  sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
864 
865 #define VECT_INVERT_BLOCK(first) \
866  sse2_invert_block((__m128i*)first);
867 
868 #define VECT_AND_BLOCK(dst, src) \
869  sse4_and_block((__m128i*) dst, (__m128i*) (src))
870 
871 #define VECT_AND_DIGEST(dst, src) \
872  sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
873 
874 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
875  sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
876 
877 #define VECT_OR_BLOCK(dst, src) \
878  sse2_or_block((__m128i*) dst, (__m128i*) (src))
879 
880 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
881  sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
882 
883 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
884  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
885 
886 #define VECT_SUB_BLOCK(dst, src) \
887  sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
888 
889 #define VECT_SUB_DIGEST(dst, src) \
890  sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
891 
892 #define VECT_XOR_ARR(dst, src, src_end) \
893  sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
894 
895 #define VECT_COPY_BLOCK(dst, src) \
896  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
897 
898 #define VECT_SET_BLOCK(dst, value) \
899  sse2_set_block((__m128i*) dst, value)
900 
901 #define VECT_IS_ZERO_BLOCK(dst) \
902  sse4_is_all_zero((__m128i*) dst)
903 
904 #define VECT_IS_ONE_BLOCK(dst) \
905  sse4_is_all_one((__m128i*) dst)
906 
907 #define VECT_IS_DIGEST_ZERO(start) \
908  sse4_is_digest_zero((__m128i*)start)
909 
910 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
911  sse4_lower_bound_scan_u32(arr, target, from, to)
912 
913 
914 #ifdef __GNUG__
915 #pragma GCC diagnostic pop
916 #endif
917 
918 
919 #ifdef __GNUG__
920 #pragma GCC diagnostic pop
921 #endif
922 
923 
924 } // namespace
925 
926 
927 
928 
929 #endif