FD.io VPP  v20.05-21-gb1500e9ff
Vector Packet Processing
vector_sse42.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  Copyright (c) 2005 Eliot Dresselhaus
17 
18  Permission is hereby granted, free of charge, to any person obtaining
19  a copy of this software and associated documentation files (the
20  "Software"), to deal in the Software without restriction, including
21  without limitation the rights to use, copy, modify, merge, publish,
22  distribute, sublicense, and/or sell copies of the Software, and to
23  permit persons to whom the Software is furnished to do so, subject to
24  the following conditions:
25 
26  The above copyright notice and this permission notice shall be
27  included in all copies or substantial portions of the Software.
28 
29  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
33  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
34  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
35  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37 
38 #ifndef included_vector_sse2_h
39 #define included_vector_sse2_h
40 
41 #include <vppinfra/error_bootstrap.h> /* for ASSERT */
42 #include <x86intrin.h>
43 
44 /* *INDENT-OFF* */
/* X-macro lists of the 128-bit vector types.  Each entry has the form
   _(type prefix, element bits, element count, intrinsic suffix); the
   expanding site supplies its own definition of `_`. */
#define foreach_sse42_vec128i \
  _(i,8,16,epi8) \
  _(i,16,8,epi16) \
  _(i,32,4,epi32) \
  _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8) \
  _(u,16,8,epi16) \
  _(u,32,4,epi32) \
  _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps) \
  _(f,64,2,pd)
51 
/* Generator for the basic per-type helpers.  For a vector type V = t##s##x##c
   (for example u32x4) one expansion defines:
     V_splat (x)              copy the scalar x into every lane
     V_load_unaligned (p)     load 16 bytes from p, no alignment required
     V_store_unaligned (v, p) store 16 bytes to p, no alignment required
     V_is_all_zero (x)        non-zero iff all 128 bits of x are zero
     V_is_equal (a, b)        non-zero iff a and b are bitwise identical
     V_is_all_equal (v, x)    non-zero iff every lane of v equals x
   `i` is the suffix of the matching _mm_set1_* intrinsic.
   The load takes a const pointer because it never writes through p. */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (const void *p) \
{ return (t##s##x##c) _mm_loadu_si128 ((const __m128i *) p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

80 #undef _
81 
82 /* min, max */
83 #define _(t, s, c, i) \
84 static_always_inline t##s##x##c \
85 t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
86 { return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
87 \
88 static_always_inline t##s##x##c \
89 t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
90 { return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); } \
91 
92 _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
93 _(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
94 #undef _
95 /* *INDENT-ON* */
96 
97 #define CLIB_VEC128_SPLAT_DEFINED
98 #define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
99 
100 /* 128 bit interleaves. */
101 always_inline u8x16
102 u8x16_interleave_hi (u8x16 a, u8x16 b)
103 {
104  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
105 }
106 
107 always_inline u8x16
108 u8x16_interleave_lo (u8x16 a, u8x16 b)
109 {
110  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
111 }
112 
113 always_inline u16x8
114 u16x8_interleave_hi (u16x8 a, u16x8 b)
115 {
116  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
117 }
118 
119 always_inline u16x8
120 u16x8_interleave_lo (u16x8 a, u16x8 b)
121 {
122  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
123 }
124 
127 {
128  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
129 }
130 
133 {
134  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
135 }
136 
139 {
140  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
141 }
142 
145 {
146  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
147 }
148 
149 /* 64 bit interleaves. */
150 always_inline u8x8
151 u8x8_interleave_hi (u8x8 a, u8x8 b)
152 {
153  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
154 }
155 
156 always_inline u8x8
157 u8x8_interleave_lo (u8x8 a, u8x8 b)
158 {
159  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
160 }
161 
162 always_inline u16x4
163 u16x4_interleave_hi (u16x4 a, u16x4 b)
164 {
165  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
166 }
167 
168 always_inline u16x4
169 u16x4_interleave_lo (u16x4 a, u16x4 b)
170 {
171  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
172 }
173 
174 always_inline u32x2
175 u32x2_interleave_hi (u32x2 a, u32x2 b)
176 {
177  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
178 }
179 
180 always_inline u32x2
181 u32x2_interleave_lo (u32x2 a, u32x2 b)
182 {
183  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
184 }
185 
186 /* 128 bit packs. */
187 always_inline u8x16
188 u16x8_pack (u16x8 lo, u16x8 hi)
189 {
190  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
191 }
192 
193 always_inline i8x16
195 {
196  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
197 }
198 
199 always_inline u16x8
201 {
202  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
203 }
204 
205 /* 64 bit packs. */
206 always_inline u8x8
207 u16x4_pack (u16x4 lo, u16x4 hi)
208 {
209  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
210 }
211 
212 always_inline i8x8
213 i16x4_pack (i16x4 lo, i16x4 hi)
214 {
215  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
216 }
217 
218 always_inline u16x4
219 u32x2_pack (u32x2 lo, u32x2 hi)
220 {
221  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
222 }
223 
224 always_inline i16x4
225 i32x2_pack (i32x2 lo, i32x2 hi)
226 {
227  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
228 }
229 
230 #ifndef __ICC
233 {
234  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
235 }
236 
239 {
240  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
241 }
242 
243 always_inline void
245 {
246  _mm_storel_pi ((__m64 *) a, (__m128) x);
247 }
248 
249 always_inline void
251 {
252  _mm_storeh_pi ((__m64 *) a, (__m128) x);
253 }
254 #endif
255 
256 #define _signed_binop(n,m,f,g) \
257  /* Unsigned */ \
258  always_inline u##n##x##m \
259  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
260  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
261  \
262  /* Signed */ \
263  always_inline i##n##x##m \
264  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
265  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
266 /* Addition/subtraction with saturation. */
267 _signed_binop (8, 16, add_saturate, adds_epu)
268 _signed_binop (16, 8, add_saturate, adds_epu)
269 _signed_binop (8, 16, sub_saturate, subs_epu)
270 _signed_binop (16, 8, sub_saturate, subs_epu)
271 /* Multiplication. */
273 {
274  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
275 }
276 
277 always_inline u16x8
278 u16x8_mul_lo (u16x8 x, u16x8 y)
279 {
280  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
281 }
282 
285 {
286  return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
287 }
288 
289 always_inline u16x8
290 u16x8_mul_hi (u16x8 x, u16x8 y)
291 {
292  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
293 }
294 
295 /* 128 bit shifts. */
296 
297 #define _(p,a,b,c,f) \
298  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
299  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
300  \
301  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
302  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
303 
304 _(u, 16, 8, left, sll)
305 _(u, 32, 4, left, sll)
306 _(u, 64, 2, left, sll)
307 _(u, 16, 8, right, srl)
308 _(u, 32, 4, right, srl)
309 _(u, 64, 2, right, srl)
310 _(i, 16, 8, left, sll)
311 _(i, 32, 4, left, sll)
312 _(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
313 #undef _
314 /* 64 bit shifts. */
315  always_inline u16x4
316 u16x4_shift_left (u16x4 x, u16x4 i)
317 {
318  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
319 };
320 
321 always_inline u32x2
322 u32x2_shift_left (u32x2 x, u32x2 i)
323 {
324  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
325 };
326 
327 always_inline u16x4
328 u16x4_shift_right (u16x4 x, u16x4 i)
329 {
330  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
331 };
332 
333 always_inline u32x2
334 u32x2_shift_right (u32x2 x, u32x2 i)
335 {
336  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
337 };
338 
339 always_inline i16x4
340 i16x4_shift_left (i16x4 x, i16x4 i)
341 {
342  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
343 };
344 
345 always_inline i32x2
346 i32x2_shift_left (i32x2 x, i32x2 i)
347 {
348  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
349 };
350 
351 always_inline i16x4
352 i16x4_shift_right (i16x4 x, i16x4 i)
353 {
354  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
355 };
356 
357 always_inline i32x2
358 i32x2_shift_right (i32x2 x, i32x2 i)
359 {
360  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
361 };
362 
/* Whole-register shifts by a number of lanes ("words").
   The u8x16 forms shift the 128-bit value by n bytes with _mm_slli_si128 /
   _mm_srli_si128, which take an immediate byte count.  The wider forms scale
   n by the lane size and reuse the byte shift.  Every macro yields a value of
   its own vector type, and arguments and expansions are fully
   parenthesized. */
#define u8x16_word_shift_left(a,n) \
  ((u8x16) _mm_slli_si128 ((__m128i) (a), (n)))
#define u8x16_word_shift_right(a,n) \
  ((u8x16) _mm_srli_si128 ((__m128i) (a), (n)))

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
397 
398 /* SSE2 has no rotate instructions: use shifts to simulate them. */
399 #define _(t,n,lr1,lr2) \
400  always_inline t##x##n \
401  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
402  { \
403  ASSERT (i >= 0 && i <= BITS (t)); \
404  return (t##x##n##_ishift_##lr1 (w, i) \
405  | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
406  } \
407  \
408  always_inline t##x##n \
409  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
410  { \
411  t##x##n j = t##x##n##_splat (BITS (t)); \
412  return (t##x##n##_shift_##lr1 (w, i) \
413  | t##x##n##_shift_##lr2 (w, j - i)); \
414  }
415 
416 _(u16, 8, left, right);
417 _(u16, 8, right, left);
418 _(u32, 4, left, right);
419 _(u32, 4, right, left);
420 _(u64, 2, left, right);
421 _(u64, 2, right, left);
422 
423 #undef _
424 
425 #ifndef __clang__
426 #define _(t,n,lr1,lr2) \
427  always_inline t##x##n \
428  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
429  { \
430  int m = sizeof (t##x##n) / sizeof (t); \
431  ASSERT (i >= 0 && i < m); \
432  return (t##x##n##_word_shift_##lr1 (w0, i) \
433  | t##x##n##_word_shift_##lr2 (w1, m - i)); \
434  } \
435  \
436  always_inline t##x##n \
437  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
438  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
439 
440 _(u8, 16, left, right);
441 _(u8, 16, right, left);
442 _(u16, 8, left, right);
443 _(u16, 8, right, left);
444 _(u32, 4, left, right);
445 _(u32, 4, right, left);
446 _(u64, 2, left, right);
447 _(u64, 2, right, left);
448 
449 #undef _
450 #endif
451 
/* Permute the four 32-bit lanes of A.  MASK is an 8-bit immediate holding
   four 2-bit source-lane indices, lowest result lane first (pshufd).
   Uses the _mm_shuffle_epi32 intrinsic rather than hand-written asm volatile,
   so the compiler can schedule, combine and constant-fold the shuffle. */
#define u32x4_select(A,MASK) \
  ((u32x4) _mm_shuffle_epi32 ((__m128i) (A), (MASK)))
461 
/* Broadcast 32-bit lane i of x to all four lanes: the 2-bit index i is
   replicated into each of the four selector fields of the shuffle mask. */
#define u32x4_splat_word(x,i) \
  u32x4_select ((x), \
		(((i) << 0) | ((i) << 2) | ((i) << 4) | ((i) << 6)))
467 
468 /* Extract low order 32 bit word. */
471 {
472  u32 result;
473  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
474  : /* inputs */ [x] "x" (x));
475  return result;
476 }
477 
480 {
481  u32x4 result;
482  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
483  : /* inputs */ [x] "r" (x));
484  return result;
485 }
486 
489 {
490  return (i32x4) u32x4_set0 ((u32) x);
491 }
492 
495 {
496  return (i32) u32x4_get0 ((u32x4) x);
497 }
498 
499 /* Converts all ones/zeros compare mask to bitmap. */
502 {
503  return _mm_movemask_epi8 ((__m128i) x);
504 }
505 
507 
510 {
511  u32 m = u8x16_compare_byte_mask ((u8x16) x);
512  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
513  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
514 }
515 
518 {
519  u8x16 zero = { 0 };
520  return u8x16_compare_byte_mask (x == zero);
521 }
522 
525 {
526  u16x8 zero = { 0 };
527  return u8x16_compare_byte_mask ((u8x16) (x == zero));
528 }
529 
532 {
533  u32x4 zero = { 0 };
534  return u8x16_compare_byte_mask ((u8x16) (x == zero));
535 }
536 
539 {
540  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
541  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
542  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
543  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
544  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
545 }
546 
549 {
550  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
551  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
552  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
553  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
554  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
555 }
556 
559 {
560  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
561  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
562  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
563  return _mm_extract_epi16 ((__m128i) x, 0);
564 }
565 
568 {
569  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
570  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
571  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
572  return _mm_extract_epi16 ((__m128i) x, 0);
573 }
574 
/* Concatenate a (high) and b (low) into a 32-byte value, shift it right by
   imm bytes and return the low 16 bytes (wraps _mm_alignr_epi8).  imm must be
   a compile-time constant.  Arguments and the expansion are parenthesized so
   the macro composes safely inside larger expressions. */
#define u8x16_align_right(a, b, imm) \
  ((u8x16) _mm_alignr_epi8 ((__m128i) (a), (__m128i) (b), (imm)))
577 
580 {
581  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
582  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
583  return v[0];
584 }
585 
588 {
589  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
590  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
591  return v[0];
592 }
593 
596 {
597  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
598  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
599  return v[0];
600 }
601 
604 {
605  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
606  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
607  return v[0];
608 }
609 
611 u8x16_msb_mask (u8x16 v)
612 {
613  return _mm_movemask_epi8 ((__m128i) v);
614 }
615 
616 #define CLIB_HAVE_VEC128_MSB_MASK
617 
618 #undef _signed_binop
619 
622 {
623  u8x16 swap = {
624  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
625  };
626  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
627 }
628 
631 {
632  u8x16 swap = {
633  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
634  };
635  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
636 }
637 
639 u8x16_reflect (u8x16 v)
640 {
641  u8x16 mask = {
642  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
643  };
644  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) mask);
645 }
646 
649 {
650  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
651 }
652 
654 u8x16_shuffle (u8x16 v, u8x16 m)
655 {
656  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
657 }
658 
660 u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
661 {
662 #if defined(__clang__) || !__OPTIMIZE__
663  u32x4 r = { v[a], v[b], v[c], v[d] };
664  return r;
665 #else
666  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
667  a | b << 2 | c << 4 | d << 6);
668 #endif
669 }
670 
671 /* _extend_to_ */
672 /* *INDENT-OFF* */
673 #define _(f,t,i) \
674 static_always_inline t \
675 f##_extend_to_##t (f x) \
676 { return (t) _mm_cvt##i ((__m128i) x); }
677 
678 _(u8x16, u16x8, epu8_epi16)
679 _(u8x16, u32x4, epu8_epi32)
680 _(u8x16, u64x2, epu8_epi64)
681 _(u16x8, u32x4, epu16_epi32)
682 _(u16x8, u64x2, epu16_epi64)
683 _(u32x4, u64x2, epu32_epi64)
684 
685 _(i8x16, i16x8, epi8_epi16)
686 _(i8x16, i32x4, epi8_epi32)
687 _(i8x16, i64x2, epi8_epi64)
688 _(i16x8, i32x4, epi16_epi32)
689 _(i16x8, i64x2, epi16_epi64)
690 _(i32x4, i64x2, epi32_epi64)
691 #undef _
692 /* *INDENT-ON* */
693 
695 u64x2_gather (void *p0, void *p1)
696 {
697  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
698  return r;
699 }
700 
702 u32x4_gather (void *p0, void *p1, void *p2, void *p3, void *p4)
703 {
704  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
705  return r;
706 }
707 
708 
710 u64x2_scatter (u64x2 r, void *p0, void *p1)
711 {
712  *(u64 *) p0 = r[0];
713  *(u64 *) p1 = r[1];
714 }
715 
717 u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
718 {
719  *(u32 *) p0 = r[0];
720  *(u32 *) p1 = r[1];
721  *(u32 *) p2 = r[2];
722  *(u32 *) p3 = r[3];
723 }
724 
726 u64x2_scatter_one (u64x2 r, int index, void *p)
727 {
728  *(u64 *) p = r[index];
729 }
730 
732 u32x4_scatter_one (u32x4 r, int index, void *p)
733 {
734  *(u32 *) p = r[index];
735 }
736 
738 u8x16_is_greater (u8x16 v1, u8x16 v2)
739 {
740  return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
741 }
742 
744 u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
745 {
746  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
747 }
748 
750 u8x16_xor3 (u8x16 a, u8x16 b, u8x16 c)
751 {
752 #if __AVX512F__
753  return (u8x16) _mm_ternarylogic_epi32 ((__m128i) a, (__m128i) b,
754  (__m128i) c, 0x96);
755 #endif
756  return a ^ b ^ c;
757 }
758 
759 #endif /* included_vector_sse2_h */
760 
761 /*
762  * fd.io coding-style-patch-verification: ON
763  *
764  * Local Variables:
765  * eval: (c-set-style "gnu")
766  * End:
767  */
static u32x2 u32x2_interleave_hi(u32x2 a, u32x2 b)
Definition: vector_sse42.h:175
#define u8x16_word_shift_right(a, n)
Definition: vector_sse42.h:364
static u64x2 u64x2_interleave_hi(u64x2 a, u64x2 b)
Definition: vector_sse42.h:138
static u32x4 u32x4_interleave_lo(u32x4 a, u32x4 b)
Definition: vector_sse42.h:132
static u16x8 u16x8_mul_lo(u16x8 x, u16x8 y)
Definition: vector_sse42.h:278
#define u8x16_align_right(a, b, imm)
Definition: vector_sse42.h:575
static_always_inline u32 u32x4_min_scalar(u32x4 v)
Definition: vector_sse42.h:579
sll right
Definition: vector_sse42.h:307
static i16 i16x8_max_scalar(i16x8 x)
Definition: vector_sse42.h:558
a
Definition: bitmap.h:538
static u64x2 u64x2_interleave_lo(u64x2 a, u64x2 b)
Definition: vector_sse42.h:144
static_always_inline void u64x2_scatter_one(u64x2 r, int index, void *p)
Definition: vector_sse42.h:726
static_always_inline u32x4 u32x4_byte_swap(u32x4 v)
Definition: vector_sse42.h:621
static u8x8 u16x4_pack(u16x4 lo, u16x4 hi)
Definition: vector_sse42.h:207
unsigned long u64
Definition: types.h:89
static u8x16 u16x8_pack(u16x8 lo, u16x8 hi)
Definition: vector_sse42.h:188
static u16x4 u16x4_interleave_hi(u16x4 a, u16x4 b)
Definition: vector_sse42.h:163
static i8x16 i16x8_pack(i16x8 lo, i16x8 hi)
Definition: vector_sse42.h:194
#define foreach_sse42_vec128i
Definition: vector_sse42.h:45
static u8x8 u8x8_interleave_hi(u8x8 a, u8x8 b)
Definition: vector_sse42.h:151
static i16x4 i16x4_shift_right(i16x4 x, i16x4 i)
Definition: vector_sse42.h:352
static i16x8 i16x8_mul_hi(i16x8 x, i16x8 y)
Definition: vector_sse42.h:284
static u16x8 u16x8_interleave_hi(u16x8 a, u16x8 b)
Definition: vector_sse42.h:114
static_always_inline u8x16 u8x16_blend(u8x16 v1, u8x16 v2, u8x16 mask)
Definition: vector_sse42.h:744
adds_epu sub_saturate
Definition: vector_sse42.h:270
static void u64x2_write_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:250
static u16x4 u16x4_interleave_lo(u16x4 a, u16x4 b)
Definition: vector_sse42.h:169
unsigned char u8
Definition: types.h:56
static u32 u16x8_zero_byte_mask(u16x8 x)
Definition: vector_sse42.h:524
static u16x8 u16x8_interleave_lo(u16x8 a, u16x8 b)
Definition: vector_sse42.h:120
static u32x2 u32x2_interleave_lo(u32x2 a, u32x2 b)
Definition: vector_sse42.h:181
static_always_inline u32x4 u32x4_gather(void *p0, void *p1, void *p2, void *p3, void *p4)
Definition: vector_sse42.h:702
static_always_inline u8x16 u8x16_xor3(u8x16 a, u8x16 b, u8x16 c)
Definition: vector_sse42.h:750
#define static_always_inline
Definition: clib.h:106
static u16x8 u32x4_pack(u32x4 lo, u32x4 hi)
Definition: vector_sse42.h:200
static_always_inline void u32x4_scatter(u32x4 r, void *p0, void *p1, void *p2, void *p3)
Definition: vector_sse42.h:717
static_always_inline u16 u8x16_msb_mask(u8x16 v)
Definition: vector_sse42.h:611
static i32x2 i32x2_shift_left(i32x2 x, i32x2 i)
Definition: vector_sse42.h:346
static u32 u32x4_get0(u32x4 x)
Definition: vector_sse42.h:470
static_always_inline u32 u32x4_max_scalar(u32x4 v)
Definition: vector_sse42.h:587
unsigned int u32
Definition: types.h:88
epu8_epi32 epu16_epi32 epu32_epi64 i32x4
Definition: vector_sse42.h:686
static i32x4 i32x4_set0(i32 x)
Definition: vector_sse42.h:488
adds_epu static subs_epu i16x8 i16x8_mul_lo(i16x8 x, i16x8 y)
Definition: vector_sse42.h:272
epu8_epi32 epu16_epi32 u64x2
Definition: vector_sse42.h:683
static u8x16 u8x16_interleave_hi(u8x16 a, u8x16 b)
Definition: vector_sse42.h:102
epu8_epi32 epu16_epi32 epu32_epi64 epi8_epi32 epi16_epi32 epi32_epi64 static_always_inline u64x2 u64x2_gather(void *p0, void *p1)
Definition: vector_sse42.h:695
static i16 i16x8_min_scalar(i16x8 x)
Definition: vector_sse42.h:567
static_always_inline u16x8 u16x8_byte_swap(u16x8 v)
Definition: vector_sse42.h:630
lo
static u8 u8x16_min_scalar(u8x16 x)
Definition: vector_sse42.h:548
unsigned short u16
Definition: types.h:57
static_always_inline u32x4 u32x4_shuffle(u32x4 v, const int a, const int b, const int c, const int d)
Definition: vector_sse42.h:660
static_always_inline void u64x2_scatter(u64x2 r, void *p0, void *p1)
Definition: vector_sse42.h:710
static_always_inline u8x16 u8x16_shuffle(u8x16 v, u8x16 m)
Definition: vector_sse42.h:654
static u32x2 u32x2_shift_left(u32x2 x, u32x2 i)
Definition: vector_sse42.h:322
#define i16x8_word_shift_right(a, n)
Definition: vector_sse42.h:377
#define always_inline
Definition: ipsec.h:28
static u32 u8x16_zero_byte_mask(u8x16 x)
Definition: vector_sse42.h:517
add_saturate
Definition: vector_sse42.h:268
svmdb_client_t * c
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:317
static i32 i32x4_get0(i32x4 x)
Definition: vector_sse42.h:494
static_always_inline u8x16 u8x16_reflect(u8x16 v)
Definition: vector_sse42.h:639
static u32x4 u32x4_interleave_hi(u32x4 a, u32x4 b)
Definition: vector_sse42.h:126
epu8_epi32 epu16_epi32 epu32_epi64 epi8_epi32 epi16_epi32 i64x2
Definition: vector_sse42.h:690
signed int i32
Definition: types.h:77
static u32 u8x16_compare_byte_mask(u8x16 x)
Definition: vector_sse42.h:501
u8 u32x4_compare_word_mask_table[256]
static_always_inline u32 i32x4_max_scalar(i32x4 v)
Definition: vector_sse42.h:603
static i16x4 i16x4_shift_left(i16x4 x, i16x4 i)
Definition: vector_sse42.h:340
static i8x8 i16x4_pack(i16x4 lo, i16x4 hi)
Definition: vector_sse42.h:213
static u32 u8x16_max_scalar(u8x16 x)
Definition: vector_sse42.h:538
static u32x2 u32x2_shift_right(u32x2 x, u32x2 i)
Definition: vector_sse42.h:334
vmrglw i16x8
left
Definition: vector_sse42.h:305
static i16x4 i32x2_pack(i32x2 lo, i32x2 hi)
Definition: vector_sse42.h:225
static u16x4 u32x2_pack(u32x2 lo, u32x2 hi)
Definition: vector_sse42.h:219
#define foreach_sse42_vec128u
Definition: vector_sse42.h:47
static_always_inline u8x16 u8x16_is_greater(u8x16 v1, u8x16 v2)
Definition: vector_sse42.h:738
static_always_inline void u32x4_scatter_one(u32x4 r, int index, void *p)
Definition: vector_sse42.h:732
static u16x4 u16x4_shift_right(u16x4 x, u16x4 i)
Definition: vector_sse42.h:328
vl_api_ip4_address_t hi
Definition: arp.api:37
static_always_inline u32 i32x4_min_scalar(i32x4 v)
Definition: vector_sse42.h:595
static u8x8 u8x8_interleave_lo(u8x8 a, u8x8 b)
Definition: vector_sse42.h:157
static u32 u32x4_compare_word_mask(u32x4 x)
Definition: vector_sse42.h:509
static u8x16 u8x16_interleave_lo(u8x16 a, u8x16 b)
Definition: vector_sse42.h:108
static u64x2 u64x2_read_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:238
static u16x8 u16x8_mul_hi(u16x8 x, u16x8 y)
Definition: vector_sse42.h:290
static i32x2 i32x2_shift_right(i32x2 x, i32x2 i)
Definition: vector_sse42.h:358
static void u64x2_write_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:244
unsigned long long u32x4
Definition: ixge.c:28
static u32x4 u32x4_set0(u32 x)
Definition: vector_sse42.h:479
static u64x2 u64x2_read_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:232
static u32 u32x4_zero_byte_mask(u32x4 x)
Definition: vector_sse42.h:531
static_always_inline u32x4 u32x4_hadd(u32x4 v1, u32x4 v2)
Definition: vector_sse42.h:648
signed short i16
Definition: types.h:46