$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_AVX_H
11 #define EIGEN_PACKET_MATH_AVX_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
// Default value, used only when the macro has not been defined already.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Default register count, unless AVX512 is enabled or a value was already
// provided.
#if !(defined(EIGEN_VECTORIZE_AVX512) || defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS))
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif

// Advertise a single-instruction multiply-add when FMA vectorization is on.
#if defined(EIGEN_VECTORIZE_FMA) && !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
33 
34 typedef __m256 Packet8f;
35 typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
36 typedef __m256d Packet4d;
37 #ifndef EIGEN_VECTORIZE_AVX512FP16
38 typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
39 #endif
40 typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
41 typedef eigen_packet_wrapper<__m256i, 4> Packet8ui;
42 
43 #ifdef EIGEN_VECTORIZE_AVX2
44 // Start from 3 to be compatible with AVX512
45 typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
46 typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
47 #endif
48 
49 template <>
50 struct is_arithmetic<__m256> {
51  enum { value = true };
52 };
53 template <>
54 struct is_arithmetic<__m256i> {
55  enum { value = true };
56 };
57 template <>
58 struct is_arithmetic<__m256d> {
59  enum { value = true };
60 };
61 template <>
62 struct is_arithmetic<Packet8i> {
63  enum { value = true };
64 };
65 // Note that `Packet8ui` uses the underlying type `__m256i`, which is
66 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
67 // operations used in `GenericPacketMath.h`.
68 template <>
69 struct is_arithmetic<Packet8ui> {
70  enum { value = false };
71 };
72 #ifndef EIGEN_VECTORIZE_AVX512FP16
73 template <>
74 struct is_arithmetic<Packet8h> {
75  enum { value = true };
76 };
77 #endif
78 template <>
79 struct is_arithmetic<Packet8bf> {
80  enum { value = true };
81 };
#ifdef EIGEN_VECTORIZE_AVX2
// The signed 64-bit integer packet is flagged as arithmetic.
template <>
struct is_arithmetic<Packet4l> {
  enum { value = true };
};
// Note that `Packet4ul` uses the underlying type `__m256i`, whose lanes are
// treated as _signed_ integers. That breaks some arithmetic operations used in
// `GenericPacketMath.h` for this unsigned 64-bit packet, so it is flagged as
// non-arithmetic.
template <>
struct is_arithmetic<Packet4ul> {
  enum { value = false };
};
#endif
95 
96 // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
97 // to leverage AVX512 instructions.
98 #ifndef EIGEN_VECTORIZE_AVX512
99 template <>
100 struct packet_traits<float> : default_packet_traits {
101  typedef Packet8f type;
102  typedef Packet4f half;
103  enum {
104  Vectorizable = 1,
105  AlignedOnScalar = 1,
106  size = 8,
107 
108  HasCmp = 1,
109  HasDiv = 1,
110  HasReciprocal = EIGEN_FAST_MATH,
111  HasSin = EIGEN_FAST_MATH,
112  HasCos = EIGEN_FAST_MATH,
113  HasACos = 1,
114  HasASin = 1,
115  HasATan = 1,
116  HasATanh = 1,
117  HasLog = 1,
118  HasLog1p = 1,
119  HasExpm1 = 1,
120  HasExp = 1,
121  HasPow = 1,
122  HasNdtri = 1,
123  HasBessel = 1,
124  HasSqrt = 1,
125  HasRsqrt = 1,
126  HasCbrt = 1,
127  HasTanh = EIGEN_FAST_MATH,
128  HasErf = EIGEN_FAST_MATH,
129  HasErfc = EIGEN_FAST_MATH,
130  HasBlend = 1
131  };
132 };
133 template <>
134 struct packet_traits<double> : default_packet_traits {
135  typedef Packet4d type;
136  typedef Packet2d half;
137  enum {
138  Vectorizable = 1,
139  AlignedOnScalar = 1,
140  size = 4,
141 
142  HasCmp = 1,
143  HasDiv = 1,
144 #ifdef EIGEN_VECTORIZE_AVX2
145  HasSin = EIGEN_FAST_MATH,
146  HasCos = EIGEN_FAST_MATH,
147 #endif
148  HasTanh = EIGEN_FAST_MATH,
149  HasLog = 1,
150  HasErf = 1,
151  HasErfc = 1,
152  HasExp = 1,
153  HasPow = 1,
154  HasSqrt = 1,
155  HasRsqrt = 1,
156  HasCbrt = 1,
157  HasATan = 1,
158  HasATanh = 1,
159  HasBlend = 1
160  };
161 };
162 
163 template <>
164 struct packet_traits<Eigen::half> : default_packet_traits {
165  typedef Packet8h type;
166  // There is no half-size packet for Packet8h.
167  typedef Packet8h half;
168  enum {
169  Vectorizable = 1,
170  AlignedOnScalar = 1,
171  size = 8,
172 
173  HasCmp = 1,
174  HasAdd = 1,
175  HasSub = 1,
176  HasMul = 1,
177  HasDiv = 1,
178  HasSin = EIGEN_FAST_MATH,
179  HasCos = EIGEN_FAST_MATH,
180  HasNegate = 1,
181  HasAbs = 1,
182  HasAbs2 = 0,
183  HasMin = 1,
184  HasMax = 1,
185  HasConj = 1,
186  HasSetLinear = 0,
187  HasLog = 1,
188  HasLog1p = 1,
189  HasExpm1 = 1,
190  HasExp = 1,
191  HasSqrt = 1,
192  HasRsqrt = 1,
193  HasTanh = EIGEN_FAST_MATH,
194  HasErf = EIGEN_FAST_MATH,
195  HasBlend = 0,
196  HasBessel = 1,
197  HasNdtri = 1
198  };
199 };
200 
201 template <>
202 struct packet_traits<bfloat16> : default_packet_traits {
203  typedef Packet8bf type;
204  // There is no half-size packet for current Packet8bf.
205  // TODO: support as SSE path.
206  typedef Packet8bf half;
207  enum {
208  Vectorizable = 1,
209  AlignedOnScalar = 1,
210  size = 8,
211 
212  HasCmp = 1,
213  HasAdd = 1,
214  HasSub = 1,
215  HasMul = 1,
216  HasDiv = 1,
217  HasSin = EIGEN_FAST_MATH,
218  HasCos = EIGEN_FAST_MATH,
219  HasNegate = 1,
220  HasAbs = 1,
221  HasAbs2 = 0,
222  HasMin = 1,
223  HasMax = 1,
224  HasConj = 1,
225  HasSetLinear = 0,
226  HasLog = 1,
227  HasLog1p = 1,
228  HasExpm1 = 1,
229  HasExp = 1,
230  HasSqrt = 1,
231  HasRsqrt = 1,
232  HasTanh = EIGEN_FAST_MATH,
233  HasErf = EIGEN_FAST_MATH,
234  HasBlend = 0,
235  HasBessel = 1,
236  HasNdtri = 1
237  };
238 };
239 
240 template <>
241 struct packet_traits<int> : default_packet_traits {
242  typedef Packet8i type;
243  typedef Packet4i half;
244  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 8 };
245 };
246 template <>
247 struct packet_traits<uint32_t> : default_packet_traits {
248  typedef Packet8ui type;
249  typedef Packet4ui half;
250  enum {
251  Vectorizable = 1,
252  AlignedOnScalar = 1,
253  size = 8,
254 
255  HasDiv = 0,
256  HasNegate = 0,
257  HasSqrt = 0,
258 
259  HasCmp = 1,
260  HasMin = 1,
261  HasMax = 1,
262  HasShift = 1
263  };
264 };
265 
266 #ifdef EIGEN_VECTORIZE_AVX2
267 template <>
268 struct packet_traits<int64_t> : default_packet_traits {
269  typedef Packet4l type;
270  typedef Packet2l half;
271  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
272 };
273 template <>
274 struct packet_traits<uint64_t> : default_packet_traits {
275  typedef Packet4ul type;
276  // There is no half-size packet for current Packet4ul.
277  // TODO: support as SSE path.
278  typedef Packet4ul half;
279  enum {
280  Vectorizable = 1,
281  AlignedOnScalar = 1,
282  size = 4,
283 
284  // HasMin = 0,
285  // HasMax = 0,
286  HasDiv = 0,
287  HasBlend = 0,
288  HasTranspose = 0,
289  HasNegate = 0,
290  HasSqrt = 0,
291  HasCmp = 1,
292  HasShift = 1
293  };
294 };
295 #endif
296 
297 #endif
298 
299 template <>
300 struct scalar_div_cost<float, true> {
301  enum { value = 14 };
302 };
303 template <>
304 struct scalar_div_cost<double, true> {
305  enum { value = 16 };
306 };
307 
308 template <>
309 struct unpacket_traits<Packet8f> {
310  typedef float type;
311  typedef Packet4f half;
312  typedef Packet8i integer_packet;
313  typedef uint8_t mask_t;
314  enum {
315  size = 8,
316  alignment = Aligned32,
317  vectorizable = true,
318  masked_load_available = true,
319  masked_store_available = true
320 #ifdef EIGEN_VECTORIZE_AVX512
321  ,
322  masked_fpops_available = true
323 #endif
324  };
325 };
326 template <>
327 struct unpacket_traits<Packet4d> {
328  typedef double type;
329  typedef Packet2d half;
330 #ifdef EIGEN_VECTORIZE_AVX2
331  typedef Packet4l integer_packet;
332 #endif
333  enum {
334  size = 4,
335  alignment = Aligned32,
336  vectorizable = true,
337  masked_load_available = false,
338  masked_store_available = false
339  };
340 };
341 template <>
342 struct unpacket_traits<Packet8i> {
343  typedef int type;
344  typedef Packet4i half;
345  enum {
346  size = 8,
347  alignment = Aligned32,
348  vectorizable = true,
349  masked_load_available = false,
350  masked_store_available = false
351  };
352 };
353 template <>
354 struct unpacket_traits<Packet8ui> {
355  typedef uint32_t type;
356  typedef Packet4ui half;
357  enum {
358  size = 8,
359  alignment = Aligned32,
360  vectorizable = true,
361  masked_load_available = false,
362  masked_store_available = false
363  };
364 };
365 #ifdef EIGEN_VECTORIZE_AVX2
366 template <>
367 struct unpacket_traits<Packet4l> {
368  typedef int64_t type;
369  typedef Packet2l half;
370  enum {
371  size = 4,
372  alignment = Aligned32,
373  vectorizable = true,
374  masked_load_available = false,
375  masked_store_available = false
376  };
377 };
378 template <>
379 struct unpacket_traits<Packet4ul> {
380  typedef uint64_t type;
381  typedef Packet4ul half;
382  enum {
383  size = 4,
384  alignment = Aligned32,
385  vectorizable = true,
386  masked_load_available = false,
387  masked_store_available = false
388  };
389 };
390 #endif
391 template <>
392 struct unpacket_traits<Packet8bf> {
393  typedef bfloat16 type;
394  typedef Packet8bf half;
395  enum {
396  size = 8,
397  alignment = Aligned16,
398  vectorizable = true,
399  masked_load_available = false,
400  masked_store_available = false
401  };
402 };
403 
404 // Helper function for bit packing snippet of low precision comparison.
405 // It packs the flags from 16x16 to 8x16.
406 EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
407  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
408  _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
409 }
410 
411 #ifdef EIGEN_VECTORIZE_AVX2
412 template <>
413 EIGEN_STRONG_INLINE Packet4l pset1<Packet4l>(const int64_t& from) {
414  return _mm256_set1_epi64x(from);
415 }
416 template <>
417 EIGEN_STRONG_INLINE Packet4ul pset1<Packet4ul>(const uint64_t& from) {
418  return _mm256_set1_epi64x(numext::bit_cast<uint64_t>(from));
419 }
420 template <>
421 EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) {
422  return _mm256_setzero_si256();
423 }
424 template <>
425 EIGEN_STRONG_INLINE Packet4ul pzero(const Packet4ul& /*a*/) {
426  return _mm256_setzero_si256();
427 }
428 template <>
429 EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) {
430  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
431 }
432 template <>
433 EIGEN_STRONG_INLINE Packet4ul peven_mask(const Packet4ul& /*a*/) {
434  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
435 }
436 template <>
437 EIGEN_STRONG_INLINE Packet4l pload1<Packet4l>(const int64_t* from) {
438  return _mm256_set1_epi64x(*from);
439 }
440 template <>
441 EIGEN_STRONG_INLINE Packet4ul pload1<Packet4ul>(const uint64_t* from) {
442  return _mm256_set1_epi64x(*from);
443 }
444 template <>
445 EIGEN_STRONG_INLINE Packet4l padd<Packet4l>(const Packet4l& a, const Packet4l& b) {
446  return _mm256_add_epi64(a, b);
447 }
448 template <>
449 EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
450  return _mm256_add_epi64(a, b);
451 }
452 template <>
453 EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
454  return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
455 }
456 template <>
457 EIGEN_STRONG_INLINE Packet4ul plset<Packet4ul>(const uint64_t& a) {
458  return padd(pset1<Packet4ul>(a), Packet4ul(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
459 }
460 template <>
461 EIGEN_STRONG_INLINE Packet4l psub<Packet4l>(const Packet4l& a, const Packet4l& b) {
462  return _mm256_sub_epi64(a, b);
463 }
464 template <>
465 EIGEN_STRONG_INLINE Packet4ul psub<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
466  return _mm256_sub_epi64(a, b);
467 }
468 template <>
469 EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) {
470  return psub(pzero(a), a);
471 }
472 template <>
473 EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) {
474  return a;
475 }
476 template <>
477 EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) {
478  return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1));
479 }
480 template <>
481 EIGEN_STRONG_INLINE Packet4ul pcmp_le(const Packet4ul& a, const Packet4ul& b) {
482  return (Packet4ul)pcmp_le((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
483  (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
484 }
485 template <>
486 EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) {
487  return _mm256_cmpgt_epi64(b, a);
488 }
489 template <>
490 EIGEN_STRONG_INLINE Packet4ul pcmp_lt(const Packet4ul& a, const Packet4ul& b) {
491  return (Packet4ul)pcmp_lt((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
492  (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
493 }
494 template <>
495 EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) {
496  return _mm256_cmpeq_epi64(a, b);
497 }
498 template <>
499 EIGEN_STRONG_INLINE Packet4ul pcmp_eq(const Packet4ul& a, const Packet4ul& b) {
500  return _mm256_cmpeq_epi64(a, b);
501 }
502 template <>
503 EIGEN_STRONG_INLINE Packet4l ptrue<Packet4l>(const Packet4l& a) {
504  return _mm256_cmpeq_epi64(a, a);
505 }
506 template <>
507 EIGEN_STRONG_INLINE Packet4ul ptrue<Packet4ul>(const Packet4ul& a) {
508  return _mm256_cmpeq_epi64(a, a);
509 }
510 template <>
511 EIGEN_STRONG_INLINE Packet4l pand<Packet4l>(const Packet4l& a, const Packet4l& b) {
512  return _mm256_and_si256(a, b);
513 }
514 template <>
515 EIGEN_STRONG_INLINE Packet4l por<Packet4l>(const Packet4l& a, const Packet4l& b) {
516  return _mm256_or_si256(a, b);
517 }
518 template <>
519 EIGEN_STRONG_INLINE Packet4l pxor<Packet4l>(const Packet4l& a, const Packet4l& b) {
520  return _mm256_xor_si256(a, b);
521 }
522 template <>
523 EIGEN_STRONG_INLINE Packet4ul pxor<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
524  return _mm256_xor_si256(a, b);
525 }
526 template <>
527 EIGEN_STRONG_INLINE Packet4l pandnot<Packet4l>(const Packet4l& a, const Packet4l& b) {
528  return _mm256_andnot_si256(b, a);
529 }
530 template <int N>
531 EIGEN_STRONG_INLINE Packet4l plogical_shift_right(Packet4l a) {
532  return _mm256_srli_epi64(a, N);
533 }
534 template <int N>
535 EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) {
536  return _mm256_slli_epi64(a, N);
537 }
538 #ifdef EIGEN_VECTORIZE_AVX512FP16
539 template <int N>
540 EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) {
541  return _mm256_srai_epi64(a, N);
542 }
543 #else
544 template <int N>
545 EIGEN_STRONG_INLINE std::enable_if_t<(N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
546  return a;
547 }
548 template <int N>
549 EIGEN_STRONG_INLINE std::enable_if_t<(N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
550  __m256i hi_word = _mm256_srai_epi32(a, N);
551  __m256i lo_word = _mm256_srli_epi64(a, N);
552  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
553 }
554 template <int N>
555 EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
556  __m256i hi_word = _mm256_srai_epi32(a, 31);
557  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
558  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
559 }
560 template <int N>
561 EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
562  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
563 }
564 template <int N>
565 EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
566  return parithmetic_shift_right<int(N & 63)>(a);
567 }
568 #endif
569 template <>
570 EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
571  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
572 }
573 template <>
574 EIGEN_STRONG_INLINE Packet4ul pload<Packet4ul>(const uint64_t* from) {
575  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
576 }
577 template <>
578 EIGEN_STRONG_INLINE Packet4l ploadu<Packet4l>(const int64_t* from) {
579  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
580 }
581 template <>
582 EIGEN_STRONG_INLINE Packet4ul ploadu<Packet4ul>(const uint64_t* from) {
583  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
584 }
585 // Loads 2 int64_ts from memory a returns the packet {a0, a0, a1, a1}
586 template <>
587 EIGEN_STRONG_INLINE Packet4l ploaddup<Packet4l>(const int64_t* from) {
588  const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
589  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
590 }
591 // Loads 2 uint64_ts from memory a returns the packet {a0, a0, a1, a1}
592 template <>
593 EIGEN_STRONG_INLINE Packet4ul ploaddup<Packet4ul>(const uint64_t* from) {
594  const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
595  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
596 }
597 template <>
598 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
599  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
600 }
601 template <>
602 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet4ul& from) {
603  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
604 }
605 template <>
606 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet4l& from) {
607  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
608 }
609 template <>
610 EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet4ul& from) {
611  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
612 }
613 template <>
614 EIGEN_DEVICE_FUNC inline Packet4l pgather<int64_t, Packet4l>(const int64_t* from, Index stride) {
615  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
616 }
617 template <>
618 EIGEN_DEVICE_FUNC inline Packet4ul pgather<uint64_t, Packet4ul>(const uint64_t* from, Index stride) {
619  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
620 }
621 template <>
622 EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index stride) {
623  __m128i low = _mm256_extractf128_si256(from, 0);
624  to[stride * 0] = _mm_extract_epi64_0(low);
625  to[stride * 1] = _mm_extract_epi64_1(low);
626 
627  __m128i high = _mm256_extractf128_si256(from, 1);
628  to[stride * 2] = _mm_extract_epi64_0(high);
629  to[stride * 3] = _mm_extract_epi64_1(high);
630 }
631 template <>
632 EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index stride) {
633  __m128i low = _mm256_extractf128_si256(from, 0);
634  to[stride * 0] = _mm_extract_epi64_0(low);
635  to[stride * 1] = _mm_extract_epi64_1(low);
636 
637  __m128i high = _mm256_extractf128_si256(from, 1);
638  to[stride * 2] = _mm_extract_epi64_0(high);
639  to[stride * 3] = _mm_extract_epi64_1(high);
640 }
641 template <>
642 EIGEN_STRONG_INLINE void pstore1<Packet4l>(int64_t* to, const int64_t& a) {
643  Packet4l pa = pset1<Packet4l>(a);
644  pstore(to, pa);
645 }
646 template <>
647 EIGEN_STRONG_INLINE void pstore1<Packet4ul>(uint64_t* to, const uint64_t& a) {
648  Packet4ul pa = pset1<Packet4ul>(a);
649  pstore(to, pa);
650 }
651 template <>
652 EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
653  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
654 }
655 template <>
656 EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
657  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
658 }
659 
660 #define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
661 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
662  __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
663  __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0);
664  __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15);
665  __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0);
666 
667  kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32));
668  kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49));
669  kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32));
670  kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49));
671 }
672 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ul, 4>& kernel) {
673  ptranspose((PacketBlock<Packet4l, 4>&)kernel);
674 }
675 template <>
676 EIGEN_STRONG_INLINE Packet4l pmin<Packet4l>(const Packet4l& a, const Packet4l& b) {
677  __m256i cmp = _mm256_cmpgt_epi64(a, b);
678  __m256i a_min = _mm256_andnot_si256(cmp, a);
679  __m256i b_min = _mm256_and_si256(cmp, b);
680  return Packet4l(_mm256_or_si256(a_min, b_min));
681 }
682 template <>
683 EIGEN_STRONG_INLINE Packet4ul pmin<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
684  return padd((Packet4ul)pmin((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
685  (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
686  pset1<Packet4ul>(0x8000000000000000UL));
687 }
688 template <>
689 EIGEN_STRONG_INLINE Packet4l pmax<Packet4l>(const Packet4l& a, const Packet4l& b) {
690  __m256i cmp = _mm256_cmpgt_epi64(a, b);
691  __m256i a_min = _mm256_and_si256(cmp, a);
692  __m256i b_min = _mm256_andnot_si256(cmp, b);
693  return Packet4l(_mm256_or_si256(a_min, b_min));
694 }
695 template <>
696 EIGEN_STRONG_INLINE Packet4ul pmax<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
697  return padd((Packet4ul)pmax((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
698  (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
699  pset1<Packet4ul>(0x8000000000000000UL));
700 }
701 template <>
702 EIGEN_STRONG_INLINE Packet4l pabs<Packet4l>(const Packet4l& a) {
703  Packet4l pz = pzero<Packet4l>(a);
704  Packet4l cmp = _mm256_cmpgt_epi64(a, pz);
705  return psub(cmp, pxor(a, cmp));
706 }
707 template <>
708 EIGEN_STRONG_INLINE Packet4ul pabs<Packet4ul>(const Packet4ul& a) {
709  return a;
710 }
711 template <>
712 EIGEN_STRONG_INLINE Packet4l pmul<Packet4l>(const Packet4l& a, const Packet4l& b) {
713  // 64-bit mul requires avx512, so do this with 32-bit multiplication
714  __m256i upper32_a = _mm256_srli_epi64(a, 32);
715  __m256i upper32_b = _mm256_srli_epi64(b, 32);
716 
717  // upper * lower
718  __m256i mul1 = _mm256_mul_epu32(upper32_a, b);
719  __m256i mul2 = _mm256_mul_epu32(upper32_b, a);
720  // Gives us both upper*upper and lower*lower
721  __m256i mul3 = _mm256_mul_epu32(a, b);
722 
723  __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32);
724  return _mm256_add_epi64(high, mul3);
725 }
726 template <>
727 EIGEN_STRONG_INLINE Packet4ul pmul<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
728  return (Packet4ul)pmul<Packet4l>((Packet4l)a, (Packet4l)b);
729 }
730 #endif
731 
732 template <>
733 EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
734  return _mm256_set1_ps(from);
735 }
736 template <>
737 EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) {
738  return _mm256_set1_pd(from);
739 }
740 template <>
741 EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) {
742  return _mm256_set1_epi32(from);
743 }
744 template <>
745 EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) {
746  return _mm256_set1_epi32(from);
747 }
748 
749 template <>
750 EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) {
751  return _mm256_castsi256_ps(pset1<Packet8i>(from));
752 }
753 template <>
754 EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) {
755  return _mm256_castsi256_pd(_mm256_set1_epi64x(from));
756 }
757 
758 template <>
759 EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) {
760  return _mm256_setzero_ps();
761 }
762 template <>
763 EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) {
764  return _mm256_setzero_pd();
765 }
766 template <>
767 EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) {
768  return _mm256_setzero_si256();
769 }
770 template <>
771 EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) {
772  return _mm256_setzero_si256();
773 }
774 
775 template <>
776 EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) {
777  return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
778 }
779 template <>
780 EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) {
781  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
782 }
783 template <>
784 EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) {
785  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
786 }
787 template <>
788 EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) {
789  return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
790 }
791 
792 template <>
793 EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) {
794  return _mm256_broadcast_ss(from);
795 }
796 template <>
797 EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) {
798  return _mm256_broadcast_sd(from);
799 }
800 
801 template <>
802 EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
803  return _mm256_add_ps(a, b);
804 }
805 #ifdef EIGEN_VECTORIZE_AVX512
806 template <>
807 EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
808  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
809  return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask, _mm512_castps256_ps512(a), _mm512_castps256_ps512(b)));
810 }
811 #endif
812 template <>
813 EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) {
814  return _mm256_add_pd(a, b);
815 }
816 template <>
817 EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
818 #ifdef EIGEN_VECTORIZE_AVX2
819  return _mm256_add_epi32(a, b);
820 #else
821  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
822  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
823  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
824 #endif
825 }
826 template <>
827 EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
828 #ifdef EIGEN_VECTORIZE_AVX2
829  return _mm256_add_epi32(a, b);
830 #else
831  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
832  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
833  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
834 #endif
835 }
836 
837 template <>
838 EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) {
839  return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
840 }
841 template <>
842 EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) {
843  return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0, 2.0, 1.0, 0.0));
844 }
845 template <>
846 EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) {
847  return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
848 }
849 template <>
850 EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) {
851  return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
852 }
853 
854 template <>
855 EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
856  return _mm256_sub_ps(a, b);
857 }
858 template <>
859 EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) {
860  return _mm256_sub_pd(a, b);
861 }
862 template <>
863 EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
864 #ifdef EIGEN_VECTORIZE_AVX2
865  return _mm256_sub_epi32(a, b);
866 #else
867  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
868  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
869  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
870 #endif
871 }
872 template <>
873 EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
874 #ifdef EIGEN_VECTORIZE_AVX2
875  return _mm256_sub_epi32(a, b);
876 #else
877  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
878  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
879  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
880 #endif
881 }
882 
883 template <>
884 EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
885  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
886  return _mm256_xor_ps(a, mask);
887 }
888 template <>
889 EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
890  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
891  return _mm256_xor_pd(a, mask);
892 }
893 template <>
894 EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) {
895  return psub(pzero(a), a);
896 }
897 
898 template <>
899 EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) {
900  return a;
901 }
902 template <>
903 EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) {
904  return a;
905 }
906 template <>
907 EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) {
908  return a;
909 }
910 
911 template <>
912 EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
913  return _mm256_mul_ps(a, b);
914 }
915 template <>
916 EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) {
917  return _mm256_mul_pd(a, b);
918 }
919 template <>
920 EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
921 #ifdef EIGEN_VECTORIZE_AVX2
922  return _mm256_mullo_epi32(a, b);
923 #else
924  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
925  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
926  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
927 #endif
928 }
929 template <>
930 EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
931 #ifdef EIGEN_VECTORIZE_AVX2
932  return _mm256_mullo_epi32(a, b);
933 #else
934  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
935  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
936  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
937 #endif
938 }
939 
940 template <>
941 EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
942  return _mm256_div_ps(a, b);
943 }
944 template <>
945 EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
946  return _mm256_div_pd(a, b);
947 }
948 
949 template <>
950 EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b) {
951 #ifdef EIGEN_VECTORIZE_AVX512
952  return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
953 #else
954  Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
955  Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
956  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
957 #endif
958 }
959 
#ifdef EIGEN_VECTORIZE_FMA
// Fused multiply-add family; compiled only when FMA vectorization is enabled.

// pmadd: a * b + c
template <>
EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  const Packet8f fused = _mm256_fmadd_ps(a, b, c);
  return fused;
}
template <>
EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  const Packet4d fused = _mm256_fmadd_pd(a, b, c);
  return fused;
}

// pmsub: a * b - c
template <>
EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  const Packet8f fused = _mm256_fmsub_ps(a, b, c);
  return fused;
}

template <>
EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  const Packet4d fused = _mm256_fmsub_pd(a, b, c);
  return fused;
}

// pnmadd: -(a * b) + c
template <>
EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  const Packet8f fused = _mm256_fnmadd_ps(a, b, c);
  return fused;
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  const Packet4d fused = _mm256_fnmadd_pd(a, b, c);
  return fused;
}

// pnmsub: -(a * b) - c
template <>
EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  const Packet8f fused = _mm256_fnmsub_ps(a, b, c);
  return fused;
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  const Packet4d fused = _mm256_fnmsub_pd(a, b, c);
  return fused;
}

#endif
1001 
1002 template <>
1003 EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
1004  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
1005 }
1006 template <>
1007 EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
1008  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
1009 }
1010 template <>
1011 EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
1012  return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
1013 }
1014 template <>
1015 EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
1016  return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
1017 }
1018 template <>
1019 EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) {
1020  return _mm256_cmp_ps(a, a, _CMP_UNORD_Q);
1021 }
1022 
1023 template <>
1024 EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) {
1025  return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
1026 }
1027 template <>
1028 EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) {
1029  return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
1030 }
1031 template <>
1032 EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) {
1033  return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
1034 }
1035 template <>
1036 EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) {
1037  return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
1038 }
1039 
1040 template <>
1041 EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
1042 #ifdef EIGEN_VECTORIZE_AVX2
1043  return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
1044 #else
1045  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1046  lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
1047  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1048  hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
1049  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1050 #endif
1051 }
1052 template <>
1053 EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
1054 #ifdef EIGEN_VECTORIZE_AVX2
1055  return _mm256_cmpgt_epi32(b, a);
1056 #else
1057  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
1058  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
1059  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1060 #endif
1061 }
1062 template <>
1063 EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
1064 #ifdef EIGEN_VECTORIZE_AVX2
1065  return _mm256_cmpeq_epi32(a, b);
1066 #else
1067  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1068  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1069  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1070 #endif
1071 }
1072 template <>
1073 EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
1074 #ifdef EIGEN_VECTORIZE_AVX2
1075  return _mm256_cmpeq_epi32(a, b);
1076 #else
1077  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1078  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1079  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1080 #endif
1081 }
1082 
1083 template <>
1084 EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
1085 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1086  // There appears to be a bug in GCC, by which the optimizer may flip
1087  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
1088  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1089  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1090  Packet8f res;
1091  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1092  return res;
1093 #else
1094  // Arguments are swapped to match NaN propagation behavior of std::min.
1095  return _mm256_min_ps(b, a);
1096 #endif
1097 }
1098 template <>
1099 EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
1100 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1101  // See pmin above
1102  Packet4d res;
1103  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1104  return res;
1105 #else
1106  // Arguments are swapped to match NaN propagation behavior of std::min.
1107  return _mm256_min_pd(b, a);
1108 #endif
1109 }
1110 template <>
1111 EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
1112 #ifdef EIGEN_VECTORIZE_AVX2
1113  return _mm256_min_epi32(a, b);
1114 #else
1115  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1116  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1117  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1118 #endif
1119 }
1120 template <>
1121 EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1122 #ifdef EIGEN_VECTORIZE_AVX2
1123  return _mm256_min_epu32(a, b);
1124 #else
1125  __m128i lo = _mm_min_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1126  __m128i hi = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1127  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1128 #endif
1129 }
1130 
1131 template <>
1132 EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
1133 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1134  // See pmin above
1135  Packet8f res;
1136  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1137  return res;
1138 #else
1139  // Arguments are swapped to match NaN propagation behavior of std::max.
1140  return _mm256_max_ps(b, a);
1141 #endif
1142 }
1143 template <>
1144 EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
1145 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1146  // See pmin above
1147  Packet4d res;
1148  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1149  return res;
1150 #else
1151  // Arguments are swapped to match NaN propagation behavior of std::max.
1152  return _mm256_max_pd(b, a);
1153 #endif
1154 }
1155 template <>
1156 EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
1157 #ifdef EIGEN_VECTORIZE_AVX2
1158  return _mm256_max_epi32(a, b);
1159 #else
1160  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1161  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1162  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1163 #endif
1164 }
1165 template <>
1166 EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1167 #ifdef EIGEN_VECTORIZE_AVX2
1168  return _mm256_max_epu32(a, b);
1169 #else
1170  __m128i lo = _mm_max_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
1171  __m128i hi = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
1172  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1173 #endif
1174 }
1175 
#ifdef EIGEN_VECTORIZE_AVX2
// Sign of each 32-bit lane: vpsignd applied to a vector of ones yields
// 1, 0 or -1 depending on whether the matching lane of `a` is positive,
// zero or negative.
template <>
EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
  const __m256i ones = _mm256_set1_epi32(1);
  return _mm256_sign_epi32(ones, a);
}
#endif
1182 
1183 // Add specializations for min/max with prescribed NaN propagation.
1184 template <>
1185 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
1186  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
1187 }
1188 template <>
1189 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
1190  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
1191 }
1192 template <>
1193 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
1194  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
1195 }
1196 template <>
1197 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
1198  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
1199 }
1200 template <>
1201 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
1202  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
1203 }
1204 template <>
1205 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
1206  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
1207 }
1208 template <>
1209 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
1210  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
1211 }
1212 template <>
1213 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
1214  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
1215 }
1216 
1217 template <>
1218 EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) {
1219  return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1220 }
1221 template <>
1222 EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) {
1223  return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
1224 }
1225 
1226 template <>
1227 EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) {
1228  return _mm256_ceil_ps(a);
1229 }
1230 template <>
1231 EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) {
1232  return _mm256_ceil_pd(a);
1233 }
1234 
1235 template <>
1236 EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) {
1237  return _mm256_floor_ps(a);
1238 }
1239 template <>
1240 EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) {
1241  return _mm256_floor_pd(a);
1242 }
1243 
1244 template <>
1245 EIGEN_STRONG_INLINE Packet8f ptrunc<Packet8f>(const Packet8f& a) {
1246  return _mm256_round_ps(a, _MM_FROUND_TRUNC);
1247 }
1248 template <>
1249 EIGEN_STRONG_INLINE Packet4d ptrunc<Packet4d>(const Packet4d& a) {
1250  return _mm256_round_pd(a, _MM_FROUND_TRUNC);
1251 }
1252 
1253 template <>
1254 EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
1255 #ifdef EIGEN_VECTORIZE_AVX2
1256  // vpcmpeqd has lower latency than the more general vcmpps
1257  return _mm256_cmpeq_epi32(a, a);
1258 #else
1259  const __m256 b = _mm256_castsi256_ps(a);
1260  return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
1261 #endif
1262 }
1263 
1264 template <>
1265 EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
1266 #ifdef EIGEN_VECTORIZE_AVX2
1267  // vpcmpeqd has lower latency than the more general vcmpps
1268  const __m256i b = _mm256_castps_si256(a);
1269  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
1270 #else
1271  return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
1272 #endif
1273 }
1274 
1275 template <>
1276 EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
1277 #ifdef EIGEN_VECTORIZE_AVX2
1278  // vpcmpeqq has lower latency than the more general vcmppd
1279  const __m256i b = _mm256_castpd_si256(a);
1280  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
1281 #else
1282  return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
1283 #endif
1284 }
1285 
1286 template <>
1287 EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) {
1288  return _mm256_and_ps(a, b);
1289 }
1290 template <>
1291 EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) {
1292  return _mm256_and_pd(a, b);
1293 }
1294 template <>
1295 EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
1296 #ifdef EIGEN_VECTORIZE_AVX2
1297  return _mm256_and_si256(a, b);
1298 #else
1299  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1300 #endif
1301 }
1302 template <>
1303 EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1304 #ifdef EIGEN_VECTORIZE_AVX2
1305  return _mm256_and_si256(a, b);
1306 #else
1307  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1308 #endif
1309 }
1310 
1311 template <>
1312 EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) {
1313  return _mm256_or_ps(a, b);
1314 }
1315 template <>
1316 EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) {
1317  return _mm256_or_pd(a, b);
1318 }
1319 template <>
1320 EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
1321 #ifdef EIGEN_VECTORIZE_AVX2
1322  return _mm256_or_si256(a, b);
1323 #else
1324  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1325 #endif
1326 }
1327 template <>
1328 EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1329 #ifdef EIGEN_VECTORIZE_AVX2
1330  return _mm256_or_si256(a, b);
1331 #else
1332  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1333 #endif
1334 }
1335 
1336 template <>
1337 EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) {
1338  return _mm256_xor_ps(a, b);
1339 }
1340 template <>
1341 EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) {
1342  return _mm256_xor_pd(a, b);
1343 }
1344 template <>
1345 EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
1346 #ifdef EIGEN_VECTORIZE_AVX2
1347  return _mm256_xor_si256(a, b);
1348 #else
1349  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1350 #endif
1351 }
1352 template <>
1353 EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1354 #ifdef EIGEN_VECTORIZE_AVX2
1355  return _mm256_xor_si256(a, b);
1356 #else
1357  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
1358 #endif
1359 }
1360 
1361 template <>
1362 EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) {
1363  return _mm256_andnot_ps(b, a);
1364 }
1365 template <>
1366 EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) {
1367  return _mm256_andnot_pd(b, a);
1368 }
1369 template <>
1370 EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
1371 #ifdef EIGEN_VECTORIZE_AVX2
1372  return _mm256_andnot_si256(b, a);
1373 #else
1374  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
1375 #endif
1376 }
1377 template <>
1378 EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
1379 #ifdef EIGEN_VECTORIZE_AVX2
1380  return _mm256_andnot_si256(b, a);
1381 #else
1382  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
1383 #endif
1384 }
1385 
1386 template <>
1387 EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
1388  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1389 }
1390 template <>
1391 EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
1392  return pcmp_eq(a, pmin(a, b));
1393 }
1394 
1395 template <>
1396 EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) {
1397  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
1398  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
1399  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1400 }
1401 template <>
1402 EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) {
1403  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
1404  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
1405  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1406 }
1407 
1408 template <>
1409 EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
1410  return _mm256_blendv_ps(b, a, mask);
1411 }
1412 template <>
1413 EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b) {
1414  return _mm256_castps_si256(
1415  _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
1416 }
1417 template <>
1418 EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b) {
1419  return _mm256_castps_si256(
1420  _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
1421 }
1422 
1423 template <>
1424 EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b) {
1425  return _mm256_blendv_pd(b, a, mask);
1426 }
1427 
1428 template <int N>
1429 EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
1430 #ifdef EIGEN_VECTORIZE_AVX2
1431  return _mm256_srai_epi32(a, N);
1432 #else
1433  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
1434  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
1435  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1436 #endif
1437 }
1438 
1439 template <int N>
1440 EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
1441 #ifdef EIGEN_VECTORIZE_AVX2
1442  return _mm256_srli_epi32(a, N);
1443 #else
1444  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
1445  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
1446  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1447 #endif
1448 }
1449 
1450 template <int N>
1451 EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
1452 #ifdef EIGEN_VECTORIZE_AVX2
1453  return _mm256_slli_epi32(a, N);
1454 #else
1455  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
1456  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
1457  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1458 #endif
1459 }
1460 
1461 template <int N>
1462 EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
1463  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
1464 }
1465 template <int N>
1466 EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
1467  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
1468 }
1469 template <int N>
1470 EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
1471  return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
1472 }
1473 
1474 template <>
1475 EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
1476  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from);
1477 }
1478 template <>
1479 EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) {
1480  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from);
1481 }
1482 template <>
1483 EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) {
1484  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1485 }
1486 template <>
1487 EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) {
1488  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1489 }
1490 
1491 template <>
1492 EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
1493  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from);
1494 }
1495 template <>
1496 EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) {
1497  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from);
1498 }
1499 template <>
1500 EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) {
1501  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1502 }
1503 template <>
1504 EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) {
1505  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1506 }
1507 
1508 template <>
1509 EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
1510 #ifdef EIGEN_VECTORIZE_AVX512
1511  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
1512  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
1513 #else
1514  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
1515  const Packet8i bit_mask =
1516  _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
1517  mask = por<Packet8i>(mask, bit_mask);
1518  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
1519  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
1520 #endif
1521 }
1522 
1523 // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
1524 template <>
1525 EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
1526  // TODO try to find a way to avoid the need of a temporary register
1527  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
1528  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
1529  // return _mm256_unpacklo_ps(tmp,tmp);
1530 
1531  // _mm256_insertf128_ps is very slow on Haswell, thus:
1532  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1533  // mimic an "inplace" permutation of the lower 128bits using a blend
1534  tmp = _mm256_blend_ps(
1535  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1536  // then we can perform a consistent permutation on the global register to get everything in shape:
1537  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
1538 }
1539 // Loads 2 doubles from memory a returns the packet {a0, a0, a1, a1}
1540 template <>
1541 EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) {
1542  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
1543  return _mm256_permute_pd(tmp, 3 << 2);
1544 }
1545 // Loads 4 integers from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
1546 template <>
1547 EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from) {
1548 #ifdef EIGEN_VECTORIZE_AVX2
1549  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
1550  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
1551 #else
1552  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1553  // mimic an "inplace" permutation of the lower 128bits using a blend
1554  tmp = _mm256_blend_ps(
1555  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1556  // then we can perform a consistent permutation on the global register to get everything in shape:
1557  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
1558 #endif
1559 }
1560 template <>
1561 EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
1562 #ifdef EIGEN_VECTORIZE_AVX2
1563  const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
1564  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
1565 #else
1566  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
1567  // mimic an "inplace" permutation of the lower 128bits using a blend
1568  tmp = _mm256_blend_ps(
1569  tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
1570  // then we can perform a consistent permutation on the global register to get
1571  // everything in shape:
1572  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
1573 #endif
1574 }
1575 
1576 // Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1}
1577 template <>
1578 EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
1579  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
1580  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
1581 }
1582 template <>
1583 EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from) {
1584  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
1585 }
1586 template <>
1587 EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
1588  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
1589 }
1590 
1591 template <>
1592 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
1593  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from);
1594 }
1595 template <>
1596 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) {
1597  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from);
1598 }
1599 template <>
1600 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) {
1601  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1602 }
1603 template <>
1604 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) {
1605  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1606 }
1607 
1608 template <>
1609 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
1610  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from);
1611 }
1612 template <>
1613 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) {
1614  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from);
1615 }
1616 template <>
1617 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) {
1618  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1619 }
1620 template <>
1621 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) {
1622  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1623 }
1624 
1625 template <>
1626 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
1627 #ifdef EIGEN_VECTORIZE_AVX512
1628  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
1629  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
1630 #else
1631  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
1632  const Packet8i bit_mask =
1633  _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
1634  mask = por<Packet8i>(mask, bit_mask);
1635  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
1636 #if EIGEN_COMP_MSVC
1637  // MSVC sometimes seems to use a bogus mask with maskstore.
1638  const __m256i ifrom = _mm256_castps_si256(from);
1639  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0),
1640  reinterpret_cast<char*>(to));
1641  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1),
1642  reinterpret_cast<char*>(to + 4));
1643 #else
1644  EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
1645 #endif
1646 #endif
1647 }
1648 
1649 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
1650 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride),
1651 // 4);
1652 template <>
1653 EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
1654  return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
1655  from[2 * stride], from[1 * stride], from[0 * stride]);
1656 }
1657 template <>
1658 EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
1659  return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1660 }
1661 template <>
1662 EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
1663  return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
1664  from[2 * stride], from[1 * stride], from[0 * stride]);
1665 }
1666 template <>
1667 EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
1668  return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
1669 }
1670 
1671 template <>
1672 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
1673  __m128 low = _mm256_extractf128_ps(from, 0);
1674  to[stride * 0] = _mm_cvtss_f32(low);
1675  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
1676  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
1677  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
1678 
1679  __m128 high = _mm256_extractf128_ps(from, 1);
1680  to[stride * 4] = _mm_cvtss_f32(high);
1681  to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
1682  to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
1683  to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
1684 }
1685 template <>
1686 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
1687  __m128d low = _mm256_extractf128_pd(from, 0);
1688  to[stride * 0] = _mm_cvtsd_f64(low);
1689  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
1690  __m128d high = _mm256_extractf128_pd(from, 1);
1691  to[stride * 2] = _mm_cvtsd_f64(high);
1692  to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
1693 }
1694 template <>
1695 EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
1696  __m128i low = _mm256_extractf128_si256(from, 0);
1697  to[stride * 0] = _mm_extract_epi32(low, 0);
1698  to[stride * 1] = _mm_extract_epi32(low, 1);
1699  to[stride * 2] = _mm_extract_epi32(low, 2);
1700  to[stride * 3] = _mm_extract_epi32(low, 3);
1701 
1702  __m128i high = _mm256_extractf128_si256(from, 1);
1703  to[stride * 4] = _mm_extract_epi32(high, 0);
1704  to[stride * 5] = _mm_extract_epi32(high, 1);
1705  to[stride * 6] = _mm_extract_epi32(high, 2);
1706  to[stride * 7] = _mm_extract_epi32(high, 3);
1707 }
1708 template <>
1709 EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
1710  pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
1711 }
1712 
1713 template <>
1714 EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) {
1715  Packet8f pa = pset1<Packet8f>(a);
1716  pstore(to, pa);
1717 }
1718 template <>
1719 EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) {
1720  Packet4d pa = pset1<Packet4d>(a);
1721  pstore(to, pa);
1722 }
1723 template <>
1724 EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) {
1725  Packet8i pa = pset1<Packet8i>(a);
1726  pstore(to, pa);
1727 }
1728 
1729 #ifndef EIGEN_VECTORIZE_AVX512
1730 template <>
1731 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1732  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1733 }
1734 template <>
1735 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1736  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1737 }
1738 template <>
1739 EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1740  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1741 }
1742 template <>
1743 EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1744  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1745 }
1746 #endif
1747 
1748 template <>
1749 EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
1750  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
1751 }
1752 template <>
1753 EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
1754  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
1755 }
1756 template <>
1757 EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
1758  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
1759 }
1760 template <>
1761 EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
1762  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
1763 }
1764 
1765 template <>
1766 EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
1767  __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
1768  return _mm256_permute2f128_ps(tmp, tmp, 1);
1769 }
1770 template <>
1771 EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
1772  __m256d tmp = _mm256_shuffle_pd(a, a, 5);
1773  return _mm256_permute2f128_pd(tmp, tmp, 1);
1774 #if 0
1775  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
1776  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
1777  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
1778  return _mm256_permute_pd(swap_halves,5);
1779 #endif
1780 }
1781 template <>
1782 EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
1783  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
1784 }
1785 template <>
1786 EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
1787  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
1788 }
1789 
1790 #ifdef EIGEN_VECTORIZE_AVX2
1791 template <>
1792 EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
1793  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
1794 }
1795 template <>
1796 EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
1797  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
1798 }
1799 #endif
1800 
1801 // pabs should be ok
1802 template <>
1803 EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
1804  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
1805  return _mm256_and_ps(a, mask);
1806 }
1807 template <>
1808 EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
1809  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
1810  return _mm256_and_pd(a, mask);
1811 }
1812 template <>
1813 EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) {
1814 #ifdef EIGEN_VECTORIZE_AVX2
1815  return _mm256_abs_epi32(a);
1816 #else
1817  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
1818  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
1819  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
1820 #endif
1821 }
1822 template <>
1823 EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
1824  return a;
1825 }
1826 
1827 #ifndef EIGEN_VECTORIZE_AVX512FP16
1828 template <>
1829 EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
1830  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
1831 }
1832 #endif // EIGEN_VECTORIZE_AVX512FP16
1833 
1834 template <>
1835 EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
1836  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
1837 }
1838 template <>
1839 EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
1840 #ifdef EIGEN_VECTORIZE_AVX2
1841  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
1842 #else
1843  return _mm256_castsi256_ps(parithmetic_shift_right<31>(Packet8i(_mm256_castps_si256(a))));
1844 #endif
1845 }
1846 template <>
1847 EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& /*unused*/) {
1848  return _mm256_setzero_si256();
1849 }
1850 #ifdef EIGEN_VECTORIZE_AVX2
1851 template <>
1852 EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
1853  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
1854 }
1855 template <>
1856 EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& /*unused*/) {
1857  return _mm256_setzero_si256();
1858 }
1859 #endif
1860 
1861 template <>
1862 EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
1863  return pfrexp_generic(a, exponent);
1864 }
1865 
1866 // Extract exponent without existence of Packet4l.
1867 template <>
1868 EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
1869  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1870  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
1871 #ifdef EIGEN_VECTORIZE_AVX2
1872  a_expo = _mm256_srli_epi64(a_expo, 52);
1873  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1874  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1875 #else
1876  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1877  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1878  lo = _mm_srli_epi64(lo, 52);
1879  hi = _mm_srli_epi64(hi, 52);
1880 #endif
1881  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
1882  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
1883  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
1884  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
1885  return exponent;
1886 }
1887 
1888 template <>
1889 EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
1890  return pfrexp_generic(a, exponent);
1891 }
1892 
1893 template <>
1894 EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
1895  return pldexp_generic(a, exponent);
1896 }
1897 
1898 template <>
1899 EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
1900  // Clamp exponent to [-2099, 2099]
1901  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
1902  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1903 
1904  // Split 2^e into four factors and multiply.
1905  const Packet4i bias = pset1<Packet4i>(1023);
1906  Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4)
1907 
1908  // 2^b
1909  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
1910  Packet4i lo = _mm_slli_epi64(hi, 52);
1911  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1912  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1913  Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1914 
1915  // 2^(e - 3b)
1916  b = psub(psub(psub(e, b), b), b); // e - 3b
1917  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
1918  lo = _mm_slli_epi64(hi, 52);
1919  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1920  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1921  out = pmul(out, c); // a * 2^e
1922  return out;
1923 }
1924 
1925 template <>
1926 EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
1927  // Clamp exponent to [-1024, 1024]
1928  const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
1929  const Packet4d max_exponent = pset1<Packet4d>(1024.0);
1930  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
1931  const Packet4i bias = pset1<Packet4i>(1023);
1932 
1933  // 2^e
1934  Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
1935  const Packet4i lo = _mm_slli_epi64(hi, 52);
1936  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
1937  const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
1938  return pmul(a, c); // a * 2^e
1939 }
1940 
1941 template <>
1942 EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
1943  return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
1944 }
1945 template <>
1946 EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
1947  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
1948 }
1949 template <>
1950 EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
1951  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
1952 }
1953 
1954 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
1955  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1956  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1957  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1958  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1959  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
1960  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
1961  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
1962  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
1963  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1964  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1965  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1966  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1967  __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1968  __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1969  __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1970  __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1971  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
1972  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
1973  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
1974  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
1975  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
1976  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
1977  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
1978  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
1979 }
1980 
1981 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
1982  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
1983  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
1984  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
1985  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
1986 
1987  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1988  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1989  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1990  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1991 
1992  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
1993  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
1994  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
1995  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
1996 }
1997 
// Integer wrapper around _mm256_shuffle_ps: operands are viewed as floats for
// the shuffle and the result is viewed as integers again.
#define MM256_SHUFFLE_EPI32(A, B, M) \
  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))

// 32-bit integer unpack helpers: native intrinsics when AVX2 is enabled,
// otherwise routed through the float unpack intrinsics via casts.
#ifdef EIGEN_VECTORIZE_AVX2
#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B)
#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
#else
#define MM256_UNPACKLO_EPI32(A, B) \
  _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#define MM256_UNPACKHI_EPI32(A, B) \
  _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#endif
2010 
2011 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 8>& kernel) {
2012  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
2013  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
2014  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
2015  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
2016  __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]);
2017  __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
2018  __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
2019  __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
2020  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2021  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2022  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2023  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2024  __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
2025  __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
2026  __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
2027  __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
2028  kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
2029  kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
2030  kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
2031  kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
2032  kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
2033  kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
2034  kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
2035  kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
2036 }
2037 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 8>& kernel) {
2038  ptranspose((PacketBlock<Packet8i, 8>&)kernel);
2039 }
2040 
2041 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 4>& kernel) {
2042  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
2043  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
2044  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
2045  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
2046 
2047  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2048  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2049  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2050  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2051 
2052  kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
2053  kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
2054  kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
2055  kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
2056 }
2057 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 4>& kernel) {
2058  ptranspose((PacketBlock<Packet8i, 4>&)kernel);
2059 }
2060 
2061 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
2062  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
2063  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
2064  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
2065  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
2066 
2067  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
2068  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
2069  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
2070  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
2071 }
2072 
2073 EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
2074  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
2075  0 - ifPacket.select[0]);
2076 }
2077 
2078 EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
2079  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
2080  0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
2081  0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2082 }
2083 
2084 template <>
2085 EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
2086  const Packet8f& elsePacket) {
2087  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
2088  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
2089 }
2090 
2091 template <>
2092 EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
2093  const Packet4d& elsePacket) {
2094  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
2095  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
2096 }
2097 
2098 // Packet math for Eigen::half
2099 #ifndef EIGEN_VECTORIZE_AVX512FP16
2100 template <>
2101 struct unpacket_traits<Packet8h> {
2102  typedef Eigen::half type;
2103  enum {
2104  size = 8,
2105  alignment = Aligned16,
2106  vectorizable = true,
2107  masked_load_available = false,
2108  masked_store_available = false
2109  };
2110  typedef Packet8h half;
2111 };
2112 
2113 template <>
2114 EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
2115  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
2116 }
2117 
2118 template <>
2119 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
2120  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
2121 }
2122 
2123 template <>
2124 EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
2125  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
2126 }
2127 
2128 template <>
2129 EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
2130  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
2131 }
2132 
2133 template <>
2134 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
2135  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
2136 }
2137 
2138 template <>
2139 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
2140  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
2141 }
2142 
2143 template <>
2144 EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
2145  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2146  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2147  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
2148  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
2149  return _mm_set_epi16(d, d, c, c, b, b, a, a);
2150 }
2151 
2152 template <>
2153 EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
2154  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2155  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2156  return _mm_set_epi16(b, b, b, b, a, a, a, a);
2157 }
2158 
2159 template <>
2160 EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
2161  return _mm_cmpeq_epi32(a, a);
2162 }
2163 
2164 template <>
2165 EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
2166  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2167  return _mm_andnot_si128(sign_mask, a);
2168 }
2169 
2170 EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
2171 #ifdef EIGEN_HAS_FP16_C
2172  return _mm256_cvtph_ps(a);
2173 #else
2174  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(
2175  _mm256_insertf128_si256(_mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
2176  return pp;
2177 #endif
2178 }
2179 
2180 EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
2181 #ifdef EIGEN_HAS_FP16_C
2182  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
2183 #else
2184  __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
2185  __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
2186  return _mm_packus_epi32(lo, hi);
2187 #endif
2188 }
2189 
2190 template <>
2191 EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
2192  return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
2193 }
2194 
2195 template <>
2196 EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
2197  return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
2198 }
2199 
2200 template <>
2201 EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
2202  return float2half(plset<Packet8f>(static_cast<float>(a)));
2203 }
2204 
2205 template <>
2206 EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
2207  // in some cases Packet4i is a wrapper around __m128i, so we either need to
2208  // cast to Packet4i to directly call the intrinsics as below:
2209  return _mm_or_si128(a, b);
2210 }
2211 template <>
2212 EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
2213  return _mm_xor_si128(a, b);
2214 }
2215 template <>
2216 EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
2217  return _mm_and_si128(a, b);
2218 }
2219 template <>
2220 EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
2221  return _mm_andnot_si128(b, a);
2222 }
2223 
2224 template <>
2225 EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
2226  return _mm_blendv_epi8(b, a, mask);
2227 }
2228 
2229 template <>
2230 EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
2231  return float2half(pround<Packet8f>(half2float(a)));
2232 }
2233 
2234 template <>
2235 EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
2236  return float2half(print<Packet8f>(half2float(a)));
2237 }
2238 
2239 template <>
2240 EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
2241  return float2half(pceil<Packet8f>(half2float(a)));
2242 }
2243 
2244 template <>
2245 EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
2246  return float2half(pfloor<Packet8f>(half2float(a)));
2247 }
2248 
2249 template <>
2250 EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
2251  return float2half(ptrunc<Packet8f>(half2float(a)));
2252 }
2253 
2254 template <>
2255 EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
2256  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
2257  constexpr uint16_t kAbsMask = (1 << 15) - 1;
2258  return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
2259 }
2260 
2261 template <>
2262 EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
2263  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
2264  constexpr uint16_t kAbsMask = (1 << 15) - 1;
2265  return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
2266 }
2267 
2268 // convert the sign-magnitude representation to two's complement
2269 EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
2270  constexpr uint16_t kAbsMask = (1 << 15) - 1;
2271  // if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
2272  return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
2273 }
2274 
2275 // return true if both `a` and `b` are not NaN
2276 EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
2277  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
2278  constexpr uint16_t kAbsMask = (1 << 15) - 1;
2279  __m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
2280  __m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
2281  // check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
2282  // SSE has no `lesser or equal` instruction for integers, but comparing against kInf + 1 accomplishes the same goal
2283  return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
2284 }
2285 
2286 template <>
2287 EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
2288  __m128i isOrdered = pisordered(a, b);
2289  __m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
2290  return _mm_and_si128(isOrdered, isEqual);
2291 }
2292 
2293 template <>
2294 EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
2295  __m128i isOrdered = pisordered(a, b);
2296  __m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
2297  return _mm_andnot_si128(isGreater, isOrdered);
2298 }
2299 
2300 template <>
2301 EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
2302  __m128i isOrdered = pisordered(a, b);
2303  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
2304  return _mm_and_si128(isOrdered, isLess);
2305 }
2306 
2307 template <>
2308 EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
2309  __m128i isUnordered = por(pisnan(a), pisnan(b));
2310  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
2311  return _mm_or_si128(isUnordered, isLess);
2312 }
2313 
2314 template <>
2315 EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) {
2316  return a;
2317 }
2318 
2319 template <>
2320 EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
2321  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2322  return _mm_xor_si128(a, sign_mask);
2323 }
2324 
2325 #ifndef EIGEN_VECTORIZE_AVX512FP16
2326 template <>
2327 EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
2328  Packet8f af = half2float(a);
2329  Packet8f bf = half2float(b);
2330  Packet8f rf = padd(af, bf);
2331  return float2half(rf);
2332 }
2333 
2334 template <>
2335 EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
2336  Packet8f af = half2float(a);
2337  Packet8f bf = half2float(b);
2338  Packet8f rf = psub(af, bf);
2339  return float2half(rf);
2340 }
2341 
2342 template <>
2343 EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
2344  Packet8f af = half2float(a);
2345  Packet8f bf = half2float(b);
2346  Packet8f rf = pmul(af, bf);
2347  return float2half(rf);
2348 }
2349 
2350 template <>
2351 EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
2352  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
2353 }
2354 
2355 template <>
2356 EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
2357  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
2358 }
2359 
2360 template <>
2361 EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
2362  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
2363 }
2364 
2365 template <>
2366 EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
2367  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
2368 }
2369 
2370 template <>
2371 EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
2372  Packet8f af = half2float(a);
2373  Packet8f bf = half2float(b);
2374  Packet8f rf = pdiv(af, bf);
2375  return float2half(rf);
2376 }
2377 #endif
2378 
2379 template <>
2380 EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
2381  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
2382  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
2383  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
2384  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
2385  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
2386  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
2387  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
2388  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
2389  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
2390 }
2391 
2392 template <>
2393 EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
2394  EIGEN_ALIGN32 Eigen::half aux[8];
2395  pstore(aux, from);
2396  to[stride * 0] = aux[0];
2397  to[stride * 1] = aux[1];
2398  to[stride * 2] = aux[2];
2399  to[stride * 3] = aux[3];
2400  to[stride * 4] = aux[4];
2401  to[stride * 5] = aux[5];
2402  to[stride * 6] = aux[6];
2403  to[stride * 7] = aux[7];
2404 }
2405 
2406 template <>
2407 EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
2408  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2409  return _mm_shuffle_epi8(a, m);
2410 }
2411 
2412 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
2413  __m128i a = kernel.packet[0];
2414  __m128i b = kernel.packet[1];
2415  __m128i c = kernel.packet[2];
2416  __m128i d = kernel.packet[3];
2417  __m128i e = kernel.packet[4];
2418  __m128i f = kernel.packet[5];
2419  __m128i g = kernel.packet[6];
2420  __m128i h = kernel.packet[7];
2421 
2422  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
2423  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
2424  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
2425  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
2426  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
2427  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
2428  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
2429  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
2430 
2431  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
2432  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
2433  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
2434  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
2435  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
2436  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
2437  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
2438  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
2439 
2440  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
2441  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
2442  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
2443  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
2444  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
2445  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
2446  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
2447  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
2448 
2449  kernel.packet[0] = a0b0c0d0e0f0g0h0;
2450  kernel.packet[1] = a1b1c1d1e1f1g1h1;
2451  kernel.packet[2] = a2b2c2d2e2f2g2h2;
2452  kernel.packet[3] = a3b3c3d3e3f3g3h3;
2453  kernel.packet[4] = a4b4c4d4e4f4g4h4;
2454  kernel.packet[5] = a5b5c5d5e5f5g5h5;
2455  kernel.packet[6] = a6b6c6d6e6f6g6h6;
2456  kernel.packet[7] = a7b7c7d7e7f7g7h7;
2457 }
2458 
2459 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
2460  EIGEN_ALIGN32 Eigen::half in[4][8];
2461  pstore<Eigen::half>(in[0], kernel.packet[0]);
2462  pstore<Eigen::half>(in[1], kernel.packet[1]);
2463  pstore<Eigen::half>(in[2], kernel.packet[2]);
2464  pstore<Eigen::half>(in[3], kernel.packet[3]);
2465 
2466  EIGEN_ALIGN32 Eigen::half out[4][8];
2467 
2468  for (int i = 0; i < 4; ++i) {
2469  for (int j = 0; j < 4; ++j) {
2470  out[i][j] = in[j][2 * i];
2471  }
2472  for (int j = 0; j < 4; ++j) {
2473  out[i][j + 4] = in[j][2 * i + 1];
2474  }
2475  }
2476 
2477  kernel.packet[0] = pload<Packet8h>(out[0]);
2478  kernel.packet[1] = pload<Packet8h>(out[1]);
2479  kernel.packet[2] = pload<Packet8h>(out[2]);
2480  kernel.packet[3] = pload<Packet8h>(out[3]);
2481 }
2482 
2483 #endif
2484 
2485 // BFloat16 implementation.
2486 
2487 EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
2488 #ifdef EIGEN_VECTORIZE_AVX2
2489  __m256i extend = _mm256_cvtepu16_epi32(a);
2490  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
2491 #else
2492  __m128i lo = _mm_cvtepu16_epi32(a);
2493  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
2494  __m128i lo_shift = _mm_slli_epi32(lo, 16);
2495  __m128i hi_shift = _mm_slli_epi32(hi, 16);
2496  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
2497 #endif
2498 }
2499 
2500 // Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
2501 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
2502  __m256i input = _mm256_castps_si256(a);
2503 
2504 #ifdef EIGEN_VECTORIZE_AVX2
2505  // uint32_t lsb = (input >> 16);
2506  __m256i t = _mm256_srli_epi32(input, 16);
2507  // uint32_t lsb = lsb & 1;
2508  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
2509  // uint32_t rounding_bias = 0x7fff + lsb;
2510  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
2511  // input += rounding_bias;
2512  t = _mm256_add_epi32(t, input);
2513  // input = input >> 16;
2514  t = _mm256_srli_epi32(t, 16);
2515  // Check NaN before converting back to bf16
2516  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
2517  __m256i nan = _mm256_set1_epi32(0x7fc0);
2518  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
2519  // output = numext::bit_cast<uint16_t>(input);
2520  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
2521 #else
2522  // uint32_t lsb = (input >> 16);
2523  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
2524  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
2525  // uint32_t lsb = lsb & 1;
2526  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
2527  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
2528  // uint32_t rounding_bias = 0x7fff + lsb;
2529  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
2530  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
2531  // input += rounding_bias;
2532  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
2533  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
2534  // input = input >> 16;
2535  lo = _mm_srli_epi32(lo, 16);
2536  hi = _mm_srli_epi32(hi, 16);
2537  // Check NaN before converting back to bf16
2538  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
2539  __m128i nan = _mm_set1_epi32(0x7fc0);
2540  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
2541  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
2542  // output = numext::bit_cast<uint16_t>(input);
2543  return _mm_packus_epi32(lo, hi);
2544 #endif
2545 }
2546 
2547 template <>
2548 EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
2549  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
2550 }
2551 
2552 template <>
2553 EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
2554  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
2555 }
2556 
2557 template <>
2558 EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
2559  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
2560 }
2561 
2562 template <>
2563 EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
2564  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
2565 }
2566 
2567 template <>
2568 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
2569  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
2570 }
2571 
2572 template <>
2573 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
2574  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
2575 }
2576 
2577 template <>
2578 EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
2579  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2580  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2581  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
2582  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
2583  return _mm_set_epi16(d, d, c, c, b, b, a, a);
2584 }
2585 
2586 template <>
2587 EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
2588  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2589  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2590  return _mm_set_epi16(b, b, b, b, a, a, a, a);
2591 }
2592 
2593 template <>
2594 EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
2595  return _mm_cmpeq_epi32(a, a);
2596 }
2597 
2598 template <>
2599 EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
2600  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2601  return _mm_andnot_si128(sign_mask, a);
2602 }
2603 
2604 template <>
2605 EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2606  return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2607 }
2608 
2609 template <>
2610 EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2611  return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2612 }
2613 
2614 template <>
2615 EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2616  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
2617 }
2618 
2619 template <>
2620 EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) {
2621  return _mm_or_si128(a, b);
2622 }
2623 template <>
2624 EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) {
2625  return _mm_xor_si128(a, b);
2626 }
2627 template <>
2628 EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) {
2629  return _mm_and_si128(a, b);
2630 }
2631 template <>
2632 EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a, const Packet8bf& b) {
2633  return _mm_andnot_si128(b, a);
2634 }
2635 
2636 template <>
2637 EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
2638  return _mm_blendv_epi8(b, a, mask);
2639 }
2640 
2641 template <>
2642 EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
2643  return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
2644 }
2645 
2646 template <>
2647 EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
2648  return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
2649 }
2650 
2651 template <>
2652 EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
2653  return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
2654 }
2655 
2656 template <>
2657 EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
2658  return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
2659 }
2660 
2661 template <>
2662 EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
2663  return F32ToBf16(ptrunc<Packet8f>(Bf16ToF32(a)));
2664 }
2665 
2666 template <>
2667 EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2668  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
2669 }
2670 
2671 template <>
2672 EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2673  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
2674 }
2675 
2676 template <>
2677 EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2678  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
2679 }
2680 
2681 template <>
2682 EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2683  return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
2684 }
2685 
2686 template <>
2687 EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) {
2688  return a;
2689 }
2690 
2691 template <>
2692 EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
2693  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2694  return _mm_xor_si128(a, sign_mask);
2695 }
2696 
2697 template <>
2698 EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2699  return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2700 }
2701 
2702 template <>
2703 EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2704  return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2705 }
2706 
2707 template <>
2708 EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2709  return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2710 }
2711 
2712 template <>
2713 EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2714  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2715 }
2716 
2717 template <>
2718 EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2719  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2720 }
2721 
2722 template <>
2723 EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2724  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2725 }
2726 
2727 template <>
2728 EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2729  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2730 }
2731 
2732 template <>
2733 EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2734  return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
2735 }
2736 
2737 template <>
2738 EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
2739  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
2740  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
2741  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
2742  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
2743  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
2744  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
2745  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
2746  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
2747  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
2748 }
2749 
2750 template <>
2751 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
2752  EIGEN_ALIGN32 bfloat16 aux[8];
2753  pstore(aux, from);
2754  to[stride * 0] = aux[0];
2755  to[stride * 1] = aux[1];
2756  to[stride * 2] = aux[2];
2757  to[stride * 3] = aux[3];
2758  to[stride * 4] = aux[4];
2759  to[stride * 5] = aux[5];
2760  to[stride * 6] = aux[6];
2761  to[stride * 7] = aux[7];
2762 }
2763 
2764 template <>
2765 EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
2766  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2767  return _mm_shuffle_epi8(a, m);
2768 }
2769 
2770 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2771  __m128i a = kernel.packet[0];
2772  __m128i b = kernel.packet[1];
2773  __m128i c = kernel.packet[2];
2774  __m128i d = kernel.packet[3];
2775  __m128i e = kernel.packet[4];
2776  __m128i f = kernel.packet[5];
2777  __m128i g = kernel.packet[6];
2778  __m128i h = kernel.packet[7];
2779 
2780  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
2781  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
2782  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
2783  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
2784  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
2785  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
2786  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
2787  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
2788 
2789  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
2790  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
2791  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
2792  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
2793  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
2794  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
2795  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
2796  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
2797 
2798  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
2799  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
2800  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
2801  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
2802  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
2803  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
2804  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
2805  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
2806 }
2807 
2808 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2809  __m128i a = kernel.packet[0];
2810  __m128i b = kernel.packet[1];
2811  __m128i c = kernel.packet[2];
2812  __m128i d = kernel.packet[3];
2813 
2814  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
2815  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
2816  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
2817  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
2818 
2819  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
2820  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
2821  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
2822  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
2823 }
2824 
2825 /*---------------- load/store segment support ----------------*/
2826 
2827 // returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2828 inline __m128i segment_mask_4x8(Index begin, Index count) {
2829  eigen_assert(begin >= 0 && begin + count <= 4);
2830  long long mask = 1;
2831  mask <<= CHAR_BIT * count;
2832  mask--;
2833  mask <<= CHAR_BIT * begin;
2834 #if !EIGEN_ARCH_x86_64
2835  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
2836 #else
2837  return _mm_cvtsi64_si128(mask);
2838 #endif
2839 }
2840 
2841 // returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2842 inline __m128i segment_mask_8x8(Index begin, Index count) {
2843  eigen_assert(begin >= 0 && begin + count <= 8);
2844  long long mask = 1;
2845  // avoid UB when count == 8
2846  mask <<= (CHAR_BIT / 2) * count;
2847  mask <<= (CHAR_BIT / 2) * count;
2848  mask--;
2849  mask <<= CHAR_BIT * begin;
2850 #if !EIGEN_ARCH_x86_64
2851  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
2852 #else
2853  return _mm_cvtsi64_si128(mask);
2854 #endif
2855 }
2856 
2857 // returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2858 inline __m128i segment_mask_4x32(Index begin, Index count) {
2859  eigen_assert(begin >= 0 && begin + count <= 4);
2860  return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
2861 }
2862 
2863 // returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2864 inline __m128i segment_mask_2x64(Index begin, Index count) {
2865  eigen_assert(begin >= 0 && begin + count <= 2);
2866  return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
2867 }
2868 
2869 // returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2870 inline __m256i segment_mask_8x32(Index begin, Index count) {
2871  __m128i mask_epi8 = segment_mask_8x8(begin, count);
2872 #ifdef EIGEN_VECTORIZE_AVX2
2873  __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
2874 #else
2875  __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
2876  __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
2877  __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
2878 #endif
2879  return mask_epi32;
2880 }
2881 
2882 // returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
2883 inline __m256i segment_mask_4x64(Index begin, Index count) {
2884  __m128i mask_epi8 = segment_mask_4x8(begin, count);
2885 #ifdef EIGEN_VECTORIZE_AVX2
2886  __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
2887 #else
2888  __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
2889  __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
2890  __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
2891 #endif
2892  return mask_epi64;
2893 }
2894 
2895 /*---------------- float ----------------*/
2896 
2897 template <>
2898 struct has_packet_segment<Packet4f> : std::true_type {};
2899 
2900 template <>
2901 struct has_packet_segment<Packet8f> : std::true_type {};
2902 
2903 template <>
2904 inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
2905  return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
2906 }
2907 
2908 template <>
2909 inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
2910  _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
2911 }
2912 
2913 template <>
2914 inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
2915  return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
2916 }
2917 
2918 template <>
2919 inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
2920  _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
2921 }
2922 
2923 /*---------------- int32 ----------------*/
2924 
2925 template <>
2926 struct has_packet_segment<Packet4i> : std::true_type {};
2927 
2928 template <>
2929 struct has_packet_segment<Packet8i> : std::true_type {};
2930 
2931 #ifdef EIGEN_VECTORIZE_AVX2
2932 
2933 template <>
2934 inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
2935  return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
2936 }
2937 
2938 template <>
2939 inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
2940  _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
2941 }
2942 
2943 template <>
2944 inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
2945  return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
2946 }
2947 
2948 template <>
2949 inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
2950  _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
2951 }
2952 
2953 #else
2954 
2955 template <>
2956 inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
2957  return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
2958 }
2959 
2960 template <>
2961 inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
2962  pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
2963 }
2964 
2965 template <>
2966 inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
2967  return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
2968 }
2969 
2970 template <>
2971 inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
2972  pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
2973 }
2974 
2975 #endif
2976 
2977 /*---------------- uint32 ----------------*/
2978 
2979 template <>
2980 struct has_packet_segment<Packet4ui> : std::true_type {};
2981 
2982 template <>
2983 struct has_packet_segment<Packet8ui> : std::true_type {};
2984 
2985 template <>
2986 inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
2987  return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
2988 }
2989 
2990 template <>
2991 inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
2992  pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
2993 }
2994 
2995 template <>
2996 inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
2997  return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
2998 }
2999 
3000 template <>
3001 inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
3002  pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
3003 }
3004 
3005 /*---------------- double ----------------*/
3006 
3007 template <>
3008 struct has_packet_segment<Packet2d> : std::true_type {};
3009 
3010 template <>
3011 struct has_packet_segment<Packet4d> : std::true_type {};
3012 
3013 template <>
3014 inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
3015  return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
3016 }
3017 
3018 template <>
3019 inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
3020  _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
3021 }
3022 
3023 template <>
3024 inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
3025  return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
3026 }
3027 
3028 template <>
3029 inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
3030  _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
3031 }
3032 
3033 #ifdef EIGEN_VECTORIZE_AVX2
3034 
3035 /*---------------- int64_t ----------------*/
3036 
3037 template <>
3038 struct has_packet_segment<Packet2l> : std::true_type {};
3039 
3040 template <>
3041 struct has_packet_segment<Packet4l> : std::true_type {};
3042 
3043 template <>
3044 inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
3045  return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
3046 }
3047 template <>
3048 inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
3049  _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
3050 }
3051 template <>
3052 inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
3053  return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
3054 }
3055 template <>
3056 inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
3057  _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
3058 }
3059 
3060 /*---------------- uint64_t ----------------*/
3061 
3062 template <>
3063 struct has_packet_segment<Packet4ul> : std::true_type {};
3064 
3065 template <>
3066 inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
3067  return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
3068 }
3069 template <>
3070 inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
3071  pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
3072 }
3073 #endif
3074 
3075 /*---------------- end load/store segment support ----------------*/
3076 
3077 } // end namespace internal
3078 
3079 } // end namespace Eigen
3080 
3081 #endif // EIGEN_PACKET_MATH_AVX_H
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
Definition: Constants.h:238
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82