FD.io VPP  v18.11-rc0-18-g2a3fb1a
Vector Packet Processing
vector_sse42.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  Copyright (c) 2005 Eliot Dresselhaus
17 
18  Permission is hereby granted, free of charge, to any person obtaining
19  a copy of this software and associated documentation files (the
20  "Software"), to deal in the Software without restriction, including
21  without limitation the rights to use, copy, modify, merge, publish,
22  distribute, sublicense, and/or sell copies of the Software, and to
23  permit persons to whom the Software is furnished to do so, subject to
24  the following conditions:
25 
26  The above copyright notice and this permission notice shall be
27  included in all copies or substantial portions of the Software.
28 
29  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
33  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
34  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
35  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37 
38 #ifndef included_vector_sse2_h
39 #define included_vector_sse2_h
40 
41 #include <vppinfra/error_bootstrap.h> /* for ASSERT */
42 #include <x86intrin.h>
43 
44 /* *INDENT-OFF* */
/* X-macro lists of the 128-bit vector types.  Each entry has the form
   _(type prefix, element bits, element count, intrinsic suffix). */
#define foreach_sse42_vec128i \
  _(i,8,16,epi8)   \
  _(i,16,8,epi16)  \
  _(i,32,4,epi32)  \
  _(i,64,2,epi64x)
#define foreach_sse42_vec128u \
  _(u,8,16,epi8)   \
  _(u,16,8,epi16)  \
  _(u,32,4,epi32)  \
  _(u,64,2,epi64x)
#define foreach_sse42_vec128f \
  _(f,32,4,ps)     \
  _(f,64,2,pd)
51 
/* Template for the per-type helpers splat, load_unaligned,
   store_unaligned, is_all_zero, is_equal and is_all_equal.

   The arguments mirror one foreach_sse42_vec128* entry:
     t = type prefix, s = element width in bits,
     c = element count, i = suffix of the _mm_set1_* intrinsic.

   NOTE(review): the bodies use the integer (si128) load/store/test
   intrinsics and the '^' operator, so this template only fits the
   integer vector lists. */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm_loadu_si128 ((__m128i const *) p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
\
/* ptest of x against itself: returns 1 when every bit of x is clear. */ \
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c x) \
{ return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
\
/* Two vectors are equal exactly when their xor has no bit set. */ \
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }

80 #undef _
81 /* *INDENT-ON* */
82 
83 #define CLIB_VEC128_SPLAT_DEFINED
84 #define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
85 
86 /* 128 bit interleaves. */
87 always_inline u8x16
88 u8x16_interleave_hi (u8x16 a, u8x16 b)
89 {
90  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
91 }
92 
93 always_inline u8x16
94 u8x16_interleave_lo (u8x16 a, u8x16 b)
95 {
96  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
97 }
98 
99 always_inline u16x8
100 u16x8_interleave_hi (u16x8 a, u16x8 b)
101 {
102  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
103 }
104 
105 always_inline u16x8
106 u16x8_interleave_lo (u16x8 a, u16x8 b)
107 {
108  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
109 }
110 
113 {
114  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
115 }
116 
119 {
120  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
121 }
122 
123 always_inline u64x2
124 u64x2_interleave_hi (u64x2 a, u64x2 b)
125 {
126  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
127 }
128 
129 always_inline u64x2
130 u64x2_interleave_lo (u64x2 a, u64x2 b)
131 {
132  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
133 }
134 
135 /* 64 bit interleaves. */
136 always_inline u8x8
137 u8x8_interleave_hi (u8x8 a, u8x8 b)
138 {
139  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
140 }
141 
142 always_inline u8x8
143 u8x8_interleave_lo (u8x8 a, u8x8 b)
144 {
145  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
146 }
147 
148 always_inline u16x4
149 u16x4_interleave_hi (u16x4 a, u16x4 b)
150 {
151  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
152 }
153 
154 always_inline u16x4
155 u16x4_interleave_lo (u16x4 a, u16x4 b)
156 {
157  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
158 }
159 
160 always_inline u32x2
161 u32x2_interleave_hi (u32x2 a, u32x2 b)
162 {
163  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
164 }
165 
166 always_inline u32x2
167 u32x2_interleave_lo (u32x2 a, u32x2 b)
168 {
169  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
170 }
171 
172 /* 128 bit packs. */
173 always_inline u8x16
174 u16x8_pack (u16x8 lo, u16x8 hi)
175 {
176  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
177 }
178 
179 always_inline i8x16
181 {
182  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
183 }
184 
185 always_inline u16x8
187 {
188  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
189 }
190 
191 /* 64 bit packs. */
192 always_inline u8x8
193 u16x4_pack (u16x4 lo, u16x4 hi)
194 {
195  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
196 }
197 
198 always_inline i8x8
199 i16x4_pack (i16x4 lo, i16x4 hi)
200 {
201  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
202 }
203 
204 always_inline u16x4
205 u32x2_pack (u32x2 lo, u32x2 hi)
206 {
207  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
208 }
209 
210 always_inline i16x4
211 i32x2_pack (i32x2 lo, i32x2 hi)
212 {
213  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
214 }
215 
216 #ifndef __ICC
217 always_inline u64x2
218 u64x2_read_lo (u64x2 x, u64 * a)
219 {
220  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
221 }
222 
223 always_inline u64x2
224 u64x2_read_hi (u64x2 x, u64 * a)
225 {
226  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
227 }
228 
229 always_inline void
230 u64x2_write_lo (u64x2 x, u64 * a)
231 {
232  _mm_storel_pi ((__m64 *) a, (__m128) x);
233 }
234 
235 always_inline void
236 u64x2_write_hi (u64x2 x, u64 * a)
237 {
238  _mm_storeh_pi ((__m64 *) a, (__m128) x);
239 }
240 #endif
241 
242 #define _signed_binop(n,m,f,g) \
243  /* Unsigned */ \
244  always_inline u##n##x##m \
245  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
246  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
247  \
248  /* Signed */ \
249  always_inline i##n##x##m \
250  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
251  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
252 /* Addition/subtraction with saturation. */
253 _signed_binop (8, 16, add_saturate, adds_epu)
254 _signed_binop (16, 8, add_saturate, adds_epu)
255 _signed_binop (8, 16, sub_saturate, subs_epu)
256 _signed_binop (16, 8, sub_saturate, subs_epu)
257 /* Multiplication. */
259 {
260  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
261 }
262 
263 always_inline u16x8
264 u16x8_mul_lo (u16x8 x, u16x8 y)
265 {
266  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
267 }
268 
271 {
272  return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
273 }
274 
275 always_inline u16x8
276 u16x8_mul_hi (u16x8 x, u16x8 y)
277 {
278  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
279 }
280 
281 /* 128 bit shifts. */
282 
283 #define _(p,a,b,c,f) \
284  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
285  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
286  \
287  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
288  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
289 
290 _(u, 16, 8, left, sll)
291 _(u, 32, 4, left, sll)
292 _(u, 64, 2, left, sll)
293 _(u, 16, 8, right, srl)
294 _(u, 32, 4, right, srl)
295 _(u, 64, 2, right, srl)
296 _(i, 16, 8, left, sll)
297 _(i, 32, 4, left, sll)
298 _(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
299 #undef _
300 /* 64 bit shifts. */
301  always_inline u16x4
302 u16x4_shift_left (u16x4 x, u16x4 i)
303 {
304  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
305 };
306 
307 always_inline u32x2
308 u32x2_shift_left (u32x2 x, u32x2 i)
309 {
310  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
311 };
312 
313 always_inline u16x4
314 u16x4_shift_right (u16x4 x, u16x4 i)
315 {
316  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
317 };
318 
319 always_inline u32x2
320 u32x2_shift_right (u32x2 x, u32x2 i)
321 {
322  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
323 };
324 
325 always_inline i16x4
326 i16x4_shift_left (i16x4 x, i16x4 i)
327 {
328  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
329 };
330 
331 always_inline i32x2
332 i32x2_shift_left (i32x2 x, i32x2 i)
333 {
334  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
335 };
336 
337 always_inline i16x4
338 i16x4_shift_right (i16x4 x, i16x4 i)
339 {
340  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
341 };
342 
343 always_inline i32x2
344 i32x2_shift_right (i32x2 x, i32x2 i)
345 {
346  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
347 };
348 
/* Whole-register "word" shifts: move the vector contents by n elements,
   shifting in zeros.  Everything is built on the byte-granular
   _mm_slli_si128 / _mm_srli_si128, so wider element types scale n by
   the element size.  Arguments and expansions are fully parenthesized
   so the macros compose safely inside larger expressions, and each
   macro yields the vector type named in its prefix. */
#define u8x16_word_shift_left(a,n) \
  ((u8x16) _mm_slli_si128 ((__m128i) (a), (n)))
#define u8x16_word_shift_right(a,n) \
  ((u8x16) _mm_srli_si128 ((__m128i) (a), (n)))

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left ((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right ((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u64)))
383 
384 /* SSE2 has no rotate instructions: use shifts to simulate them. */
385 #define _(t,n,lr1,lr2) \
386  always_inline t##x##n \
387  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
388  { \
389  ASSERT (i >= 0 && i <= BITS (t)); \
390  return (t##x##n##_ishift_##lr1 (w, i) \
391  | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
392  } \
393  \
394  always_inline t##x##n \
395  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
396  { \
397  t##x##n j = t##x##n##_splat (BITS (t)); \
398  return (t##x##n##_shift_##lr1 (w, i) \
399  | t##x##n##_shift_##lr2 (w, j - i)); \
400  }
401 
402 _(u16, 8, left, right);
403 _(u16, 8, right, left);
404 _(u32, 4, left, right);
405 _(u32, 4, right, left);
406 _(u64, 2, left, right);
407 _(u64, 2, right, left);
408 
409 #undef _
410 
411 #ifndef __clang__
412 #define _(t,n,lr1,lr2) \
413  always_inline t##x##n \
414  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
415  { \
416  int m = sizeof (t##x##n) / sizeof (t); \
417  ASSERT (i >= 0 && i < m); \
418  return (t##x##n##_word_shift_##lr1 (w0, i) \
419  | t##x##n##_word_shift_##lr2 (w1, m - i)); \
420  } \
421  \
422  always_inline t##x##n \
423  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
424  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
425 
426 _(u8, 16, left, right);
427 _(u8, 16, right, left);
428 _(u16, 8, left, right);
429 _(u16, 8, right, left);
430 _(u32, 4, left, right);
431 _(u32, 4, right, left);
432 _(u64, 2, left, right);
433 _(u64, 2, right, left);
434 
435 #undef _
436 #endif
437 
/* Shuffle the 32-bit lanes of A with a pshufd instruction and yield the
   result.  MASK is bound with the "i" constraint, so it must be a
   compile-time constant. */
#define u32x4_select(A,MASK)                                            \
({                                                                      \
  u32x4 _src = (A);                                                     \
  u32x4 _dst;                                                           \
  asm volatile ("pshufd %[mask], %[x], %[y]"                            \
                : /* outputs */ [y] "=x" (_dst)                         \
                : /* inputs */ [x] "x" (_src), [mask] "i" (MASK));      \
  _dst;                                                                 \
})
447 
/* Broadcast lane i of x to all four lanes: the lane index is repeated in
   each of the four 2-bit fields of the u32x4_select mask. */
#define u32x4_splat_word(x,i)                                           \
  u32x4_select ((x), (((i) << (2*3))                                    \
                      | ((i) << (2*2))                                  \
                      | ((i) << (2*1))                                  \
                      | ((i) << (2*0))))
453 
454 /* Extract low order 32 bit word. */
457 {
458  u32 result;
459  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
460  : /* inputs */ [x] "x" (x));
461  return result;
462 }
463 
466 {
467  u32x4 result;
468  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
469  : /* inputs */ [x] "r" (x));
470  return result;
471 }
472 
475 {
476  return (i32x4) u32x4_set0 ((u32) x);
477 }
478 
481 {
482  return (i32) u32x4_get0 ((u32x4) x);
483 }
484 
485 /* Converts all ones/zeros compare mask to bitmap. */
488 {
489  return _mm_movemask_epi8 ((__m128i) x);
490 }
491 
493 
496 {
497  u32 m = u8x16_compare_byte_mask ((u8x16) x);
498  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
499  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
500 }
501 
504 {
505  u8x16 zero = { 0 };
506  return u8x16_compare_byte_mask (x == zero);
507 }
508 
511 {
512  u16x8 zero = { 0 };
513  return u8x16_compare_byte_mask ((u8x16) (x == zero));
514 }
515 
518 {
519  u32x4 zero = { 0 };
520  return u8x16_compare_byte_mask ((u8x16) (x == zero));
521 }
522 
523 always_inline u8x16
524 u8x16_max (u8x16 x, u8x16 y)
525 {
526  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
527 }
528 
531 {
532  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
533  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
534  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
535  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
536  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
537 }
538 
539 always_inline u8x16
540 u8x16_min (u8x16 x, u8x16 y)
541 {
542  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
543 }
544 
547 {
548  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
549  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
550  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
551  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
552  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
553 }
554 
557 {
558  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
559 }
560 
563 {
564  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
565  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
566  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
567  return _mm_extract_epi16 ((__m128i) x, 0);
568 }
569 
572 {
573  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
574 }
575 
578 {
579  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
580  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
581  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
582  return _mm_extract_epi16 ((__m128i) x, 0);
583 }
584 
587 {
588  return _mm_movemask_epi8 ((__m128i) v);
589 }
590 
591 #define CLIB_HAVE_VEC128_MSB_MASK
592 
593 #undef _signed_binop
594 
597 {
598  u8x16 swap = {
599  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
600  };
601  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
602 }
603 
606 {
607  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
608 }
609 
610 #endif /* included_vector_sse2_h */
611 
612 /*
613  * fd.io coding-style-patch-verification: ON
614  *
615  * Local Variables:
616  * eval: (c-set-style "gnu")
617  * End:
618  */
static u32x2 u32x2_interleave_hi(u32x2 a, u32x2 b)
Definition: vector_sse42.h:161
#define u8x16_word_shift_right(a, n)
Definition: vector_sse42.h:350
static u64x2 u64x2_interleave_hi(u64x2 a, u64x2 b)
Definition: vector_sse42.h:124
vmrglw vmrglh hi
static u32x4 u32x4_interleave_lo(u32x4 a, u32x4 b)
Definition: vector_sse42.h:118
static u16x8 u16x8_mul_lo(u16x8 x, u16x8 y)
Definition: vector_sse42.h:264
sll right
Definition: vector_sse42.h:293
static i16 i16x8_max_scalar(i16x8 x)
Definition: vector_sse42.h:562
a
Definition: bitmap.h:538
static u64x2 u64x2_interleave_lo(u64x2 a, u64x2 b)
Definition: vector_sse42.h:130
static u8x8 u16x4_pack(u16x4 lo, u16x4 hi)
Definition: vector_sse42.h:193
unsigned long u64
Definition: types.h:89
static u8x16 u16x8_pack(u16x8 lo, u16x8 hi)
Definition: vector_sse42.h:174
static u16x4 u16x4_interleave_hi(u16x4 a, u16x4 b)
Definition: vector_sse42.h:149
static i8x16 i16x8_pack(i16x8 lo, i16x8 hi)
Definition: vector_sse42.h:180
#define foreach_sse42_vec128i
Definition: vector_sse42.h:45
static u8x8 u8x8_interleave_hi(u8x8 a, u8x8 b)
Definition: vector_sse42.h:137
static i16x4 i16x4_shift_right(i16x4 x, i16x4 i)
Definition: vector_sse42.h:338
static i16x8 i16x8_mul_hi(i16x8 x, i16x8 y)
Definition: vector_sse42.h:270
static u16x8 u16x8_interleave_hi(u16x8 a, u16x8 b)
Definition: vector_sse42.h:100
adds_epu sub_saturate
Definition: vector_sse42.h:256
static void u64x2_write_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:236
static u16x4 u16x4_interleave_lo(u16x4 a, u16x4 b)
Definition: vector_sse42.h:155
unsigned char u8
Definition: types.h:56
static u32 u16x8_zero_byte_mask(u16x8 x)
Definition: vector_sse42.h:510
static u16x8 u16x8_interleave_lo(u16x8 a, u16x8 b)
Definition: vector_sse42.h:106
static u32x2 u32x2_interleave_lo(u32x2 a, u32x2 b)
Definition: vector_sse42.h:167
i32x4
#define static_always_inline
Definition: clib.h:93
static u16x8 u32x4_pack(u32x4 lo, u32x4 hi)
Definition: vector_sse42.h:186
static i16x8 i16x8_min(i16x8 x, i16x8 y)
Definition: vector_sse42.h:571
#define always_inline
Definition: clib.h:92
unsigned long long u32x4
Definition: ixge.c:28
static_always_inline u16 u8x16_msb_mask(u8x16 v)
Definition: vector_sse42.h:586
static i32x2 i32x2_shift_left(i32x2 x, i32x2 i)
Definition: vector_sse42.h:332
static u32 u32x4_get0(u32x4 x)
Definition: vector_sse42.h:456
unsigned int u32
Definition: types.h:88
static i32x4 i32x4_set0(i32 x)
Definition: vector_sse42.h:474
adds_epu static subs_epu i16x8 i16x8_mul_lo(i16x8 x, i16x8 y)
Definition: vector_sse42.h:258
static u8x16 u8x16_max(u8x16 x, u8x16 y)
Definition: vector_sse42.h:524
static u8x16 u8x16_interleave_hi(u8x16 a, u8x16 b)
Definition: vector_sse42.h:88
static i16 i16x8_min_scalar(i16x8 x)
Definition: vector_sse42.h:577
static_always_inline u16x8 u16x8_byte_swap(u16x8 v)
Definition: vector_sse42.h:596
lo
static u8 u8x16_min_scalar(u8x16 x)
Definition: vector_sse42.h:546
#define v
Definition: acl.c:491
unsigned short u16
Definition: types.h:57
static u32x2 u32x2_shift_left(u32x2 x, u32x2 i)
Definition: vector_sse42.h:308
#define i16x8_word_shift_right(a, n)
Definition: vector_sse42.h:363
static u32 u8x16_zero_byte_mask(u8x16 x)
Definition: vector_sse42.h:503
add_saturate
Definition: vector_sse42.h:254
static i16x8 i16x8_max(i16x8 x, i16x8 y)
Definition: vector_sse42.h:556
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:303
static i32 i32x4_get0(i32x4 x)
Definition: vector_sse42.h:480
static u32x4 u32x4_interleave_hi(u32x4 a, u32x4 b)
Definition: vector_sse42.h:112
signed int i32
Definition: types.h:81
static u32 u8x16_compare_byte_mask(u8x16 x)
Definition: vector_sse42.h:487
u8 u32x4_compare_word_mask_table[256]
static i16x4 i16x4_shift_left(i16x4 x, i16x4 i)
Definition: vector_sse42.h:326
static i8x8 i16x4_pack(i16x4 lo, i16x4 hi)
Definition: vector_sse42.h:199
static u32 u8x16_max_scalar(u8x16 x)
Definition: vector_sse42.h:530
static u32x2 u32x2_shift_right(u32x2 x, u32x2 i)
Definition: vector_sse42.h:320
vmrglw i16x8
left
Definition: vector_sse42.h:291
static i16x4 i32x2_pack(i32x2 lo, i32x2 hi)
Definition: vector_sse42.h:211
static u16x4 u32x2_pack(u32x2 lo, u32x2 hi)
Definition: vector_sse42.h:205
#define foreach_sse42_vec128u
Definition: vector_sse42.h:47
static u16x4 u16x4_shift_right(u16x4 x, u16x4 i)
Definition: vector_sse42.h:314
static u8x8 u8x8_interleave_lo(u8x8 a, u8x8 b)
Definition: vector_sse42.h:143
static u8x16 u8x16_min(u8x16 x, u8x16 y)
Definition: vector_sse42.h:540
static u32 u32x4_compare_word_mask(u32x4 x)
Definition: vector_sse42.h:495
static u8x16 u8x16_interleave_lo(u8x16 a, u8x16 b)
Definition: vector_sse42.h:94
static u64x2 u64x2_read_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:224
static u16x8 u16x8_mul_hi(u16x8 x, u16x8 y)
Definition: vector_sse42.h:276
static i32x2 i32x2_shift_right(i32x2 x, i32x2 i)
Definition: vector_sse42.h:344
static void u64x2_write_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:230
static u32x4 u32x4_set0(u32 x)
Definition: vector_sse42.h:465
static u64x2 u64x2_read_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:218
static u32 u32x4_zero_byte_mask(u32x4 x)
Definition: vector_sse42.h:517
static_always_inline u32x4 u32x4_hadd(u32x4 v1, u32x4 v2)
Definition: vector_sse42.h:605
signed short i16
Definition: types.h:46