$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_SSE_H
11 #define EIGEN_PACKET_MATH_SSE_H
12 
13 #include <cstdint>
14 // IWYU pragma: private
15 #include "../../InternalHeaderCheck.h"
16 
17 namespace Eigen {
18 
19 namespace internal {
20 
// Default value; define the macro before including this header to override it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Default register count when AVX is not enabled and no value was configured:
//   32 bits (sizeof(void*) == 4) =>  8 registers
//   64 bits (sizeof(void*) == 8) => 16 registers
#if !(defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS))
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
#endif

// Enabling FMA vectorization also advertises a single-instruction multiply-add.
#if defined(EIGEN_VECTORIZE_FMA) && !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
36 
37 #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
38  (__GXX_ABI_VERSION < 1004)) || \
39  EIGEN_OS_QNX
40 // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
41 // have overloads for both types without linking error.
42 // One solution is to increase ABI version using -fabi-version=4 (or greater).
43 // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
44 // structure:
45 typedef eigen_packet_wrapper<__m128> Packet4f;
46 typedef eigen_packet_wrapper<__m128d> Packet2d;
47 #else
48 typedef __m128 Packet4f;
49 typedef __m128d Packet2d;
50 #endif
51 
52 typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
53 typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
54 typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
55 typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
56 
57 template <>
58 struct is_arithmetic<__m128> {
59  enum { value = true };
60 };
61 template <>
62 struct is_arithmetic<__m128i> {
63  enum { value = true };
64 };
65 template <>
66 struct is_arithmetic<__m128d> {
67  enum { value = true };
68 };
69 template <>
70 struct is_arithmetic<Packet4i> {
71  enum { value = true };
72 };
73 template <>
74 struct is_arithmetic<Packet2l> {
75  enum { value = true };
76 };
77 // Note that `Packet4ui` uses the underlying type `__m128i`, which is
78 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
79 // operations used in `GenericPacketMath.h`.
80 template <>
81 struct is_arithmetic<Packet4ui> {
82  enum { value = false };
83 };
84 template <>
85 struct is_arithmetic<Packet16b> {
86  enum { value = true };
87 };
88 
// Builds the 8-bit immediate used by the shuffle intrinsics: p is placed at
// bit 0, q at bit 2, r at bit 4 and s at bit 6.
template <int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (p) | ((q) << 2) | ((r) << 4) | ((s) << 6) };
};
93 
// TODO: change the implementation of all swizzle* ops from macro to template.

// Single-source swizzles: result lane 0..3 is lane p, q, r, s of `v`.
#define vec4f_swizzle1(v, p, q, r, s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))

#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))

#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))

// Double-precision swizzle expressed on 32-bit lanes: each selected double
// (index p, then q) is moved as the pair of 32-bit lanes 2*idx and 2*idx + 1.
// The indices are parenthesized so expression arguments expand correctly.
#define vec2d_swizzle1(v, p, q) \
  Packet2d(_mm_castsi128_pd(    \
      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * (p), 2 * (p) + 1, 2 * (q), 2 * (q) + 1>::mask))))

// Two-source swizzles: lanes 0 and 1 are lanes p and q of `a`, lanes 2 and 3
// are lanes r and s of `b` (semantics of _mm_shuffle_ps).
#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))

#define vec4i_swizzle2(a, b, p, q, r, s) \
  Packet4i(                              \
      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))

// Yields a Packet4ui, matching vec4ui_swizzle1 (it previously produced a Packet4i).
#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4ui(vec4i_swizzle2(a, b, p, q, r, s))
113 
114 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
115  return Packet4f(_mm_movelh_ps(a, b));
116 }
117 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
118  return Packet4f(_mm_movehl_ps(a, b));
119 }
120 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
121  return Packet4f(_mm_unpacklo_ps(a, b));
122 }
123 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
124  return Packet4f(_mm_unpackhi_ps(a, b));
125 }
126 #define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
127 
128 #define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
129 
130 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
131  return Packet2d(_mm_unpacklo_pd(a, b));
132 }
133 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
134  return Packet2d(_mm_unpackhi_pd(a, b));
135 }
136 #define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
137 
// Helpers that declare a constant packet named p4f_/p2d_/p4i_/p4ui_<NAME>
// with every lane set to X (via pset1), or to the bit pattern X for the
// _FROM_INT variant (via pset1frombits).
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) \
  const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
147 
148 // Work around lack of extract/cvt for epi64 when compiling for 32-bit.
149 #if EIGEN_ARCH_x86_64
150 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
151 #ifdef EIGEN_VECTORIZE_SSE4_1
152 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
153 #else
154 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
155  return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
156 }
157 #endif
158 #else
159 // epi64 instructions are not available. The following seems to generate the same instructions
160 // with -O2 in GCC/Clang.
161 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
162  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
163 }
164 EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
165  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
166 }
167 #endif
168 
169 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
170 // to leverage AVX instructions.
171 #ifndef EIGEN_VECTORIZE_AVX
172 template <>
173 struct packet_traits<float> : default_packet_traits {
174  typedef Packet4f type;
175  typedef Packet4f half;
176  enum {
177  Vectorizable = 1,
178  AlignedOnScalar = 1,
179  size = 4,
180 
181  HasCmp = 1,
182  HasDiv = 1,
183  HasReciprocal = EIGEN_FAST_MATH,
184  HasSin = EIGEN_FAST_MATH,
185  HasCos = EIGEN_FAST_MATH,
186  HasACos = 1,
187  HasASin = 1,
188  HasATan = 1,
189  HasATanh = 1,
190  HasLog = 1,
191  HasLog1p = 1,
192  HasExpm1 = 1,
193  HasNdtri = 1,
194  HasExp = 1,
195  HasPow = 1,
196  HasBessel = 1,
197  HasSqrt = 1,
198  HasRsqrt = 1,
199  HasCbrt = 1,
200  HasTanh = EIGEN_FAST_MATH,
201  HasErf = EIGEN_FAST_MATH,
202  HasErfc = EIGEN_FAST_MATH,
203  HasBlend = 1,
204  HasSign = 0 // The manually vectorized version is slightly slower for SSE.
205  };
206 };
207 template <>
208 struct packet_traits<double> : default_packet_traits {
209  typedef Packet2d type;
210  typedef Packet2d half;
211  enum {
212  Vectorizable = 1,
213  AlignedOnScalar = 1,
214  size = 2,
215 
216  HasCmp = 1,
217  HasDiv = 1,
218  HasSin = EIGEN_FAST_MATH,
219  HasCos = EIGEN_FAST_MATH,
220  HasTanh = EIGEN_FAST_MATH,
221  HasLog = 1,
222  HasErf = EIGEN_FAST_MATH,
223  HasErfc = EIGEN_FAST_MATH,
224  HasExp = 1,
225  HasPow = 1,
226  HasSqrt = 1,
227  HasRsqrt = 1,
228  HasCbrt = 1,
229  HasATan = 1,
230  HasATanh = 1,
231  HasBlend = 1
232  };
233 };
234 template <>
235 struct packet_traits<int> : default_packet_traits {
236  typedef Packet4i type;
237  typedef Packet4i half;
238  enum {
239  Vectorizable = 1,
240  AlignedOnScalar = 1,
241  size = 4,
242 
243  HasCmp = 1,
244  HasDiv = 1,
245  HasShift = 1,
246  HasBlend = 1
247  };
248 };
249 template <>
250 struct packet_traits<uint32_t> : default_packet_traits {
251  typedef Packet4ui type;
252  typedef Packet4ui half;
253  enum {
254  Vectorizable = 1,
255  AlignedOnScalar = 1,
256  size = 4,
257 
258  HasDiv = 0,
259  HasNegate = 0,
260  HasCmp = 1,
261  HasShift = 1,
262  HasBlend = 1
263  };
264 };
265 template <>
266 struct packet_traits<int64_t> : default_packet_traits {
267  typedef Packet2l type;
268  typedef Packet2l half;
269  enum {
270  Vectorizable = 1,
271  AlignedOnScalar = 1,
272  size = 2,
273 
274  HasDiv = 0,
275  HasCmp = 1,
276  HasShift = 1,
277  HasBlend = 1
278  };
279 };
280 #endif
281 template <>
282 struct packet_traits<bool> : default_packet_traits {
283  typedef Packet16b type;
284  typedef Packet16b half;
285  enum {
286  Vectorizable = 1,
287  AlignedOnScalar = 1,
288  size = 16,
289 
290  HasCmp = 1,
291  HasShift = 0,
292  HasAbs = 0,
293  HasAbs2 = 0,
294  HasMin = 0,
295  HasMax = 0,
296  HasConj = 0,
297  HasSqrt = 1,
298  HasNegate = 0,
299  HasSign = 0 // Don't try to vectorize psign<bool> = identity.
300  };
301 };
302 
303 template <>
304 struct unpacket_traits<Packet4f> {
305  typedef float type;
306  typedef Packet4f half;
307  typedef Packet4i integer_packet;
308  enum {
309  size = 4,
310  alignment = Aligned16,
311  vectorizable = true,
312  masked_load_available = false,
313  masked_store_available = false
314  };
315 };
316 template <>
317 struct unpacket_traits<Packet2d> {
318  typedef double type;
319  typedef Packet2d half;
320  typedef Packet2l integer_packet;
321  enum {
322  size = 2,
323  alignment = Aligned16,
324  vectorizable = true,
325  masked_load_available = false,
326  masked_store_available = false
327  };
328 };
329 template <>
330 struct unpacket_traits<Packet2l> {
331  typedef int64_t type;
332  typedef Packet2l half;
333  enum {
334  size = 2,
335  alignment = Aligned16,
336  vectorizable = true,
337  masked_load_available = false,
338  masked_store_available = false
339  };
340 };
341 template <>
342 struct unpacket_traits<Packet4i> {
343  typedef int type;
344  typedef Packet4i half;
345  enum {
346  size = 4,
347  alignment = Aligned16,
348  vectorizable = true,
349  masked_load_available = false,
350  masked_store_available = false
351  };
352 };
353 template <>
354 struct unpacket_traits<Packet4ui> {
355  typedef uint32_t type;
356  typedef Packet4ui half;
357  enum {
358  size = 4,
359  alignment = Aligned16,
360  vectorizable = true,
361  masked_load_available = false,
362  masked_store_available = false
363  };
364 };
365 template <>
366 struct unpacket_traits<Packet16b> {
367  typedef bool type;
368  typedef Packet16b half;
369  enum {
370  size = 16,
371  alignment = Aligned16,
372  vectorizable = true,
373  masked_load_available = false,
374  masked_store_available = false
375  };
376 };
377 
378 #ifndef EIGEN_VECTORIZE_AVX
379 template <>
380 struct scalar_div_cost<float, true> {
381  enum { value = 7 };
382 };
383 template <>
384 struct scalar_div_cost<double, true> {
385  enum { value = 8 };
386 };
387 #endif
388 
389 template <>
390 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
391  return _mm_set_ps1(from);
392 }
393 template <>
394 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
395  return _mm_set1_pd(from);
396 }
397 template <>
398 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
399  return _mm_set1_epi64x(from);
400 }
401 template <>
402 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
403  return _mm_set1_epi32(from);
404 }
405 template <>
406 EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
407  return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
408 }
409 template <>
410 EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
411  return _mm_set1_epi8(static_cast<char>(from));
412 }
413 
414 template <>
415 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
416  return _mm_castsi128_ps(pset1<Packet4i>(from));
417 }
418 template <>
419 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
420  return _mm_castsi128_pd(_mm_set1_epi64x(from));
421 }
422 
423 template <>
424 EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
425  return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
426 }
427 template <>
428 EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
429  return _mm_set_epi32(0, 0, -1, -1);
430 }
431 template <>
432 EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
433  return _mm_set_epi32(0, -1, 0, -1);
434 }
435 template <>
436 EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
437  return _mm_set_epi32(0, -1, 0, -1);
438 }
439 template <>
440 EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
441  return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
442 }
443 
444 template <>
445 EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
446  return _mm_setzero_ps();
447 }
448 template <>
449 EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
450  return _mm_setzero_pd();
451 }
452 template <>
453 EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
454  return _mm_setzero_si128();
455 }
456 template <>
457 EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
458  return _mm_setzero_si128();
459 }
460 template <>
461 EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
462  return _mm_setzero_si128();
463 }
464 
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of
// the more efficient pshufd instruction. However, using intrinsics for pset1
// makes GCC generate poor code in some cases (see bug 203), and inline
// assembly is not an option either because GCC then fails to reorder the
// instructions properly. Therefore the pload1 functions were introduced, to be
// used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template <>
EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
  // Load the scalar into lane 0, then broadcast that lane with a swizzle.
  const __m128 scalar = _mm_load_ss(from);
  return vec4f_swizzle1(scalar, 0, 0, 0, 0);
}
#endif
476 
477 template <>
478 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
479  return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
480 }
481 template <>
482 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
483  return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
484 }
485 template <>
486 EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
487  return _mm_add_epi32(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
488 }
489 template <>
490 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
491  return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
492 }
493 template <>
494 EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
495  return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
496 }
497 
498 template <>
499 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
500  return _mm_add_ps(a, b);
501 }
502 template <>
503 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
504  return _mm_add_pd(a, b);
505 }
506 template <>
507 EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
508  return _mm_add_epi64(a, b);
509 }
510 template <>
511 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
512  return _mm_add_epi32(a, b);
513 }
514 template <>
515 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
516  return _mm_add_epi32(a, b);
517 }
518 
519 template <>
520 EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
521  return _mm_or_si128(a, b);
522 }
523 
524 template <typename Packet>
525 EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
526 template <>
527 EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
528  return _mm_add_ss(a, b);
529 }
530 template <>
531 EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
532  return _mm_add_sd(a, b);
533 }
534 
535 template <>
536 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
537  return _mm_sub_ps(a, b);
538 }
539 template <>
540 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
541  return _mm_sub_pd(a, b);
542 }
543 template <>
544 EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
545  return _mm_sub_epi64(a, b);
546 }
547 template <>
548 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
549  return _mm_sub_epi32(a, b);
550 }
551 template <>
552 EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
553  return _mm_sub_epi32(a, b);
554 }
555 template <>
556 EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
557  return _mm_xor_si128(a, b);
558 }
559 
560 template <>
561 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
562 template <>
563 EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
564 #ifdef EIGEN_VECTORIZE_SSE3
565  return _mm_addsub_ps(a, b);
566 #else
567  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
568  return padd(a, pxor(mask, b));
569 #endif
570 }
571 
572 template <>
573 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
574 template <>
575 EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
576 #ifdef EIGEN_VECTORIZE_SSE3
577  return _mm_addsub_pd(a, b);
578 #else
579  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
580  return padd(a, pxor(mask, b));
581 #endif
582 }
583 
584 template <>
585 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
586  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
587  return _mm_xor_ps(a, mask);
588 }
589 template <>
590 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
591  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
592  return _mm_xor_pd(a, mask);
593 }
594 template <>
595 EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
596  return psub(pzero(a), a);
597 }
598 
599 template <>
600 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
601  return psub(pzero(a), a);
602 }
603 
604 template <>
605 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
606  return a;
607 }
608 template <>
609 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
610  return a;
611 }
612 template <>
613 EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
614  return a;
615 }
616 template <>
617 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
618  return a;
619 }
620 
621 template <>
622 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
623  return _mm_mul_ps(a, b);
624 }
625 template <>
626 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
627  return _mm_mul_pd(a, b);
628 }
629 template <>
630 EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
631  // 64-bit mul requires avx512, so do this with 32-bit multiplication
632  __m128i upper32_a = _mm_srli_epi64(a, 32);
633  __m128i upper32_b = _mm_srli_epi64(b, 32);
634 
635  // upper * lower
636  __m128i mul1 = _mm_mul_epu32(upper32_a, b);
637  __m128i mul2 = _mm_mul_epu32(upper32_b, a);
638  // Gives us both upper*upper and lower*lower
639  __m128i mul3 = _mm_mul_epu32(a, b);
640 
641  __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
642  return _mm_add_epi64(high, mul3);
643 }
644 template <>
645 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
646 #ifdef EIGEN_VECTORIZE_SSE4_1
647  return _mm_mullo_epi32(a, b);
648 #else
649  // this version is slightly faster than 4 scalar products
650  return vec4i_swizzle1(
651  vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
652  0, 2, 0, 2),
653  0, 2, 1, 3);
654 #endif
655 }
656 template <>
657 EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
658 #ifdef EIGEN_VECTORIZE_SSE4_1
659  return _mm_mullo_epi32(a, b);
660 #else
661  // this version is slightly faster than 4 scalar products
662  return vec4ui_swizzle1(
663  vec4ui_swizzle2(_mm_mul_epu32(a, b),
664  _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
665  0, 2, 1, 3);
666 #endif
667 }
668 
669 template <>
670 EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
671  return _mm_and_si128(a, b);
672 }
673 
674 template <>
675 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
676  return _mm_div_ps(a, b);
677 }
678 template <>
679 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
680  return _mm_div_pd(a, b);
681 }
682 
683 template <>
684 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
685 #ifdef EIGEN_VECTORIZE_AVX
686  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
687 #else
688  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
689  __m128i q_hi = _mm_cvttpd_epi32(
690  _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
691  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
692 #endif
693 }
694 
#ifdef EIGEN_VECTORIZE_FMA
// Fused multiply-add family, only available with FMA:
//   pmadd  =   a * b + c      pmsub  =   a * b - c
//   pnmadd = -(a * b) + c     pnmsub = -(a * b) - c
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return Packet4f(_mm_fmadd_ps(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return Packet2d(_mm_fmadd_pd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return Packet4f(_mm_fmsub_ps(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return Packet2d(_mm_fmsub_pd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return Packet4f(_mm_fnmadd_ps(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return Packet2d(_mm_fnmadd_pd(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return Packet4f(_mm_fnmsub_ps(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return Packet2d(_mm_fnmsub_pd(a, b, c));
}

// pmadds: scalar fused multiply-add using the _ss/_sd intrinsics, which
// operate on lane 0.
template <typename Packet>
EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
template <>
EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return Packet4f(_mm_fmadd_ss(a, b, c));
}
template <>
EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return Packet2d(_mm_fmadd_sd(a, b, c));
}
#endif
740 
#ifdef EIGEN_VECTORIZE_SSE4_1
// pselect: per lane, returns `a` where the mask lane is set and `b` elsewhere.
// Implemented with the SSE4.1 blendv instructions, which test the most
// significant bit of each mask lane. Integer packets are blended through
// float casts (32-bit lanes) or double casts (64-bit lanes).
template <>
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_blendv_ps(b, a, mask));
}

template <>
EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
  const __m128d picked = _mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask));
  return Packet2l(_mm_castpd_si128(picked));
}

template <>
EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
  const __m128 picked = _mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask));
  return Packet4i(_mm_castps_si128(picked));
}

template <>
EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
  const __m128 picked = _mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask));
  return Packet4ui(_mm_castps_si128(picked));
}

template <>
EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
  return Packet2d(_mm_blendv_pd(b, a, mask));
}
#endif
767 
768 template <>
769 EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
770  return _mm_cmpeq_epi32(a, a);
771 }
772 template <>
773 EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
774  return _mm_cmpeq_epi32(a, a);
775 }
776 template <>
777 EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& /*a*/) {
778  return pset1<Packet16b>(true);
779 }
780 template <>
781 EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
782  Packet4i b = _mm_castps_si128(a);
783  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
784 }
785 template <>
786 EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
787  Packet4i b = _mm_castpd_si128(a);
788  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
789 }
790 
791 template <>
792 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
793  return _mm_and_ps(a, b);
794 }
795 template <>
796 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
797  return _mm_and_pd(a, b);
798 }
799 template <>
800 EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
801  return _mm_and_si128(a, b);
802 }
803 template <>
804 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
805  return _mm_and_si128(a, b);
806 }
807 template <>
808 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
809  return _mm_and_si128(a, b);
810 }
811 template <>
812 EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
813  return _mm_and_si128(a, b);
814 }
815 
816 template <>
817 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
818  return _mm_or_ps(a, b);
819 }
820 template <>
821 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
822  return _mm_or_pd(a, b);
823 }
824 template <>
825 EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
826  return _mm_or_si128(a, b);
827 }
828 template <>
829 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
830  return _mm_or_si128(a, b);
831 }
832 template <>
833 EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
834  return _mm_or_si128(a, b);
835 }
836 template <>
837 EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
838  return _mm_or_si128(a, b);
839 }
840 
841 template <>
842 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
843  return _mm_xor_ps(a, b);
844 }
845 template <>
846 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
847  return _mm_xor_pd(a, b);
848 }
849 template <>
850 EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
851  return _mm_xor_si128(a, b);
852 }
853 template <>
854 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
855  return _mm_xor_si128(a, b);
856 }
857 template <>
858 EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
859  return _mm_xor_si128(a, b);
860 }
861 template <>
862 EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
863  return _mm_xor_si128(a, b);
864 }
865 
866 template <>
867 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
868  return _mm_andnot_ps(b, a);
869 }
870 template <>
871 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
872  return _mm_andnot_pd(b, a);
873 }
874 template <>
875 EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
876  return _mm_andnot_si128(b, a);
877 }
878 template <>
879 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
880  return _mm_andnot_si128(b, a);
881 }
882 template <>
883 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
884  return _mm_andnot_si128(b, a);
885 }
886 template <>
887 EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
888  return _mm_andnot_si128(b, a);
889 }
890 template <>
891 EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
892  return _mm_andnot_si128(a, b);
893 }
894 template <>
895 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
896  return _mm_cmple_ps(a, b);
897 }
898 template <>
899 EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
900  return _mm_cmplt_ps(a, b);
901 }
902 template <>
903 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
904  return _mm_cmpnge_ps(a, b);
905 }
906 template <>
907 EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
908  return _mm_cmpeq_ps(a, b);
909 }
910 
911 template <>
912 EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
913  return _mm_cmple_pd(a, b);
914 }
915 template <>
916 EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
917  return _mm_cmplt_pd(a, b);
918 }
919 template <>
920 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
921  return _mm_cmpnge_pd(a, b);
922 }
923 template <>
924 EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
925  return _mm_cmpeq_pd(a, b);
926 }
927 template <>
928 EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
929  return _mm_cmplt_epi32(a, b);
930 }
931 template <>
932 EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
933  return _mm_cmpeq_epi32(a, b);
934 }
935 template <>
936 EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
937 #ifdef EIGEN_VECTORIZE_SSE4_1
938  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
939 #else
940  return por(pcmp_lt(a, b), pcmp_eq(a, b));
941 #endif
942 }
943 template <>
944 EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
945 #ifdef EIGEN_VECTORIZE_SSE4_2
946  return _mm_cmpgt_epi64(b, a);
947 #else
948  Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
949  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
950  Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
951  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
952  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
953  // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
954  return por(hi_lt, pand(hi_eq, lo_lt));
955 #endif
956 }
957 template <>
958 EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
959 #ifdef EIGEN_VECTORIZE_SSE4_1
960  return _mm_cmpeq_epi64(a, b);
961 #else
962  Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
963  return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
964 #endif
965 }
966 template <>
967 EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
968  return por(pcmp_lt(a, b), pcmp_eq(a, b));
969 }
970 template <>
971 EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
972  // Mask out invalid bool bits to avoid UB.
973  const Packet16b kBoolMask = pset1<Packet16b>(true);
974  return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
975 }
976 template <>
977 EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
978  return _mm_cmpeq_epi32(a, b);
979 }
980 
981 template <>
982 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
983 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
984 // There appears to be a bug in GCC, by which the optimizer may
985 // flip the argument order in calls to _mm_min_ps, so we have to
986 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
987 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
988 #ifdef EIGEN_VECTORIZE_AVX
989  Packet4f res;
990  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
991 #else
992  Packet4f res = b;
993  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
994 #endif
995  return res;
996 #else
997  // Arguments are reversed to match NaN propagation behavior of std::min.
998  return _mm_min_ps(b, a);
999 #endif
1000 }
1001 template <>
1002 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
1003 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1004 // There appears to be a bug in GCC, by which the optimizer may
1005 // flip the argument order in calls to _mm_min_pd, so we have to
1006 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1007 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1008 #ifdef EIGEN_VECTORIZE_AVX
1009  Packet2d res;
1010  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1011 #else
1012  Packet2d res = b;
1013  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1014 #endif
1015  return res;
1016 #else
1017  // Arguments are reversed to match NaN propagation behavior of std::min.
1018  return _mm_min_pd(b, a);
1019 #endif
1020 }
1021 template <>
1022 EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1023  Packet2l a_lt_mask = pcmp_lt(a, b);
1024  return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
1025 }
1026 template <>
1027 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1028 #ifdef EIGEN_VECTORIZE_SSE4_1
1029  return _mm_min_epi32(a, b);
1030 #else
1031  // after some bench, this version *is* faster than a scalar implementation
1032  Packet4i mask = _mm_cmplt_epi32(a, b);
1033  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1034 #endif
1035 }
1036 template <>
1037 EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1038 #ifdef EIGEN_VECTORIZE_SSE4_1
1039  return _mm_min_epu32(a, b);
1040 #else
1041  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1042  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1043  pset1<Packet4ui>(0x80000000UL));
1044 #endif
1045 }
1046 
1047 template <>
1048 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1049 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1050 // There appears to be a bug in GCC, by which the optimizer may
1051 // flip the argument order in calls to _mm_max_ps, so we have to
1052 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1053 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1054 #ifdef EIGEN_VECTORIZE_AVX
1055  Packet4f res;
1056  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1057 #else
1058  Packet4f res = b;
1059  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1060 #endif
1061  return res;
1062 #else
1063  // Arguments are reversed to match NaN propagation behavior of std::max.
1064  return _mm_max_ps(b, a);
1065 #endif
1066 }
1067 template <>
1068 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1069 #if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1070 // There appears to be a bug in GCC, by which the optimizer may
1071 // flip the argument order in calls to _mm_max_pd, so we have to
1072 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1073 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1074 #ifdef EIGEN_VECTORIZE_AVX
1075  Packet2d res;
1076  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1077 #else
1078  Packet2d res = b;
1079  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1080 #endif
1081  return res;
1082 #else
1083  // Arguments are reversed to match NaN propagation behavior of std::max.
1084  return _mm_max_pd(b, a);
1085 #endif
1086 }
1087 template <>
1088 EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1089  Packet2l a_lt_mask = pcmp_lt(a, b);
1090  return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
1091 }
1092 template <>
1093 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1094 #ifdef EIGEN_VECTORIZE_SSE4_1
1095  return _mm_max_epi32(a, b);
1096 #else
1097  // after some bench, this version *is* faster than a scalar implementation
1098  Packet4i mask = _mm_cmpgt_epi32(a, b);
1099  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1100 #endif
1101 }
1102 template <>
1103 EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1104 #ifdef EIGEN_VECTORIZE_SSE4_1
1105  return _mm_max_epu32(a, b);
1106 #else
1107  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1108  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1109  pset1<Packet4ui>(0x80000000UL));
1110 #endif
1111 }
1112 
1113 template <>
1114 EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
1115 #ifdef EIGEN_VECTORIZE_SSE4_1
1116  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1117 #else
1118  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1119  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1120 #endif
1121 }
1122 template <>
1123 EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
1124 #ifdef EIGEN_VECTORIZE_SSE4_1
1125  return pcmp_eq(a, pmin(a, b));
1126 #else
1127  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1128  (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1129 #endif
1130 }
1131 
1132 template <typename Packet, typename Op>
1133 EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
1134  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1135  // always return a if either a or b is NaN.
1136  Packet not_nan_mask_a = pcmp_eq(a, a);
1137  Packet m = op(a, b);
1138  return pselect<Packet>(not_nan_mask_a, m, b);
1139 }
1140 
1141 template <typename Packet, typename Op>
1142 EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
1143  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1144  // always return a if either a or b is NaN.
1145  Packet not_nan_mask_a = pcmp_eq(a, a);
1146  Packet m = op(b, a);
1147  return pselect<Packet>(not_nan_mask_a, m, a);
1148 }
1149 
1150 // Add specializations for min/max with prescribed NaN propagation.
1151 template <>
1152 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1153  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
1154 }
1155 template <>
1156 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1157  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
1158 }
1159 template <>
1160 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1161  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
1162 }
1163 template <>
1164 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1165  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
1166 }
1167 template <>
1168 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1169  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
1170 }
1171 template <>
1172 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1173  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
1174 }
1175 template <>
1176 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1177  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
1178 }
1179 template <>
1180 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1181  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
1182 }
1183 
1184 template <>
1185 EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1186  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1187 }
1188 template <>
1189 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
1190  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
1191 #ifdef EIGEN_VECTORIZE_AVX
1192  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1193 #else
1194  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1195 #endif // EIGEN_VECTORIZE_AVX
1196 }
1197 template <>
1198 EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
1199  return _mm_srai_epi32(a, 31);
1200 }
1201 template <>
1202 EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
1203  return pzero(a);
1204 }
1205 template <>
1206 EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
1207  Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
1208  return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1209 }
1210 
1211 template <int N>
1212 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1213  Packet2l signbit = psignbit(a);
1214  return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
1215 }
1216 template <int N>
1217 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1218  return _mm_srli_epi64(a, N);
1219 }
1220 template <int N>
1221 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1222  return _mm_slli_epi64(a, N);
1223 }
1224 template <int N>
1225 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1226  return _mm_srai_epi32(a, N);
1227 }
1228 template <int N>
1229 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1230  return _mm_srli_epi32(a, N);
1231 }
1232 template <int N>
1233 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1234  return _mm_slli_epi32(a, N);
1235 }
1236 template <int N>
1237 EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1238  return _mm_srli_epi32(a, N);
1239 }
1240 template <int N>
1241 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1242  return _mm_srli_epi32(a, N);
1243 }
1244 template <int N>
1245 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1246  return _mm_slli_epi32(a, N);
1247 }
1248 
1249 template <>
1250 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1251  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
1252  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
1253 }
1254 template <>
1255 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1256  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
1257  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
1258 }
1259 template <>
1260 EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1261  Packet2l signbit = psignbit(a);
1262  return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
1263 }
1264 template <>
1265 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1266 #ifdef EIGEN_VECTORIZE_SSSE3
1267  return _mm_abs_epi32(a);
1268 #else
1269  Packet4i signbit = psignbit(a);
1270  return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
1271 #endif
1272 }
1273 template <>
1274 EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1275  return a;
1276 }
1277 
#ifdef EIGEN_VECTORIZE_SSE4_1
// Rounding primitives built on the SSE4.1 rounding instructions.

// pround: _mm_round_ps has no rounding mode matching numext::round, so add a copy of
// "the largest value below 0.5" carrying the sign of `a`, then truncate toward zero.
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  const Packet4f sign_mask = pset1frombits<Packet4f>(0x80000000u);
  const Packet4f below_half = pset1frombits<Packet4f>(0x3EFFFFFFu);
  const Packet4f signed_offset = por(pand(a, sign_mask), below_half);
  return _mm_round_ps(padd(signed_offset, a), _MM_FROUND_TO_ZERO);
}

// Same scheme for doubles, with the 64-bit sign mask and the 64-bit "just below 0.5" pattern.
template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  const Packet2d sign_mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d below_half = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  const Packet2d signed_offset = por(pand(a, sign_mask), below_half);
  return _mm_round_pd(padd(signed_offset, a), _MM_FROUND_TO_ZERO);
}

// print: round using the current rounding direction.
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  const Packet4f rounded = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
  return rounded;
}
template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  const Packet2d rounded = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  return rounded;
}

// pceil: lane-wise ceiling.
template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  const Packet4f ceiled = _mm_ceil_ps(a);
  return ceiled;
}
template <>
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
  const Packet2d ceiled = _mm_ceil_pd(a);
  return ceiled;
}

// pfloor: lane-wise floor.
template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  const Packet4f floored = _mm_floor_ps(a);
  return floored;
}
template <>
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
  const Packet2d floored = _mm_floor_pd(a);
  return floored;
}

// ptrunc: lane-wise truncation.
template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  const Packet4f truncated = _mm_round_ps(a, _MM_FROUND_TRUNC);
  return truncated;
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
  const Packet2d truncated = _mm_round_pd(a, _MM_FROUND_TRUNC);
  return truncated;
}
#endif
1330 
1331 template <>
1332 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1333  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
1334 }
1335 template <>
1336 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1337  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
1338 }
1339 template <>
1340 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1341  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1342 }
1343 template <>
1344 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
1345  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1346 }
1347 template <>
1348 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1349  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1350 }
1351 template <>
1352 EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
1353  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1354 }
1355 
1356 #if EIGEN_COMP_MSVC
1357 template <>
1358 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1359  EIGEN_DEBUG_UNALIGNED_LOAD
1360  return _mm_loadu_ps(from);
1361 }
1362 #else
1363 // NOTE: with the code below, MSVC's compiler crashes!
1364 
1365 template <>
1366 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1367  EIGEN_DEBUG_UNALIGNED_LOAD
1368  return _mm_loadu_ps(from);
1369 }
1370 #endif
1371 
1372 template <>
1373 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1374  EIGEN_DEBUG_UNALIGNED_LOAD
1375  return _mm_loadu_pd(from);
1376 }
1377 template <>
1378 EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1379  EIGEN_DEBUG_UNALIGNED_LOAD
1380  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1381 }
1382 template <>
1383 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1384  EIGEN_DEBUG_UNALIGNED_LOAD
1385  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1386 }
1387 template <>
1388 EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1389  EIGEN_DEBUG_UNALIGNED_LOAD
1390  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1391 }
1392 template <>
1393 EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
1394  EIGEN_DEBUG_UNALIGNED_LOAD
1395  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1396 }
1397 
1398 // Load lower part of packet zero extending.
1399 template <typename Packet>
1400 EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
1401 template <>
1402 EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
1403  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
1404 }
1405 template <>
1406 EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
1407  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1408 }
1409 
1410 // Load scalar
1411 template <typename Packet>
1412 EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
1413 template <>
1414 EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
1415  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
1416 }
1417 template <>
1418 EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
1419  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1420 }
1421 
1422 template <>
1423 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1424  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
1425 }
1426 template <>
1427 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1428  return pset1<Packet2d>(from[0]);
1429 }
1430 template <>
1431 EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1432  return pset1<Packet2l>(from[0]);
1433 }
1434 template <>
1435 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1436  Packet4i tmp;
1437  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1438  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
1439 }
1440 template <>
1441 EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1442  Packet4ui tmp;
1443  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1444  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
1445 }
1446 
1447 // Loads 8 bools from memory and returns the packet
1448 // {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
1449 template <>
1450 EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
1451  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
1452  return _mm_unpacklo_epi8(tmp, tmp);
1453 }
1454 
1455 // Loads 4 bools from memory and returns the packet
1456 // {b0, b0 b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
1457 template <>
1458 EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
1459  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
1460  tmp = _mm_unpacklo_epi8(tmp, tmp);
1461  return _mm_unpacklo_epi16(tmp, tmp);
1462 }
1463 
1464 template <>
1465 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1466  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
1467 }
1468 template <>
1469 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1470  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
1471 }
1472 template <>
1473 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1474  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1475 }
1476 template <>
1477 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
1478  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1479 }
1480 template <>
1481 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1482  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1483 }
1484 template <>
1485 EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
1486  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1487 }
1488 
1489 template <>
1490 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1491  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
1492 }
1493 template <>
1494 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1495  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
1496 }
1497 template <>
1498 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1499  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1500 }
1501 template <>
1502 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1503  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1504 }
1505 template <>
1506 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1507  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1508 }
1509 template <>
1510 EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
1511  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1512 }
1513 
1514 template <typename Scalar, typename Packet>
1515 EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
1516 template <>
1517 EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
1518  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
1519 }
1520 template <>
1521 EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
1522  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
1523 }
1524 
1525 template <typename Scalar, typename Packet>
1526 EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
1527 template <>
1528 EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
1529  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
1530 }
1531 template <>
1532 EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
1533  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
1534 }
1535 
1536 template <>
1537 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1538  return _mm_shuffle_ps(a, a, 0x1B);
1539 }
1540 template <>
1541 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1542  return _mm_shuffle_pd(a, a, 0x1);
1543 }
1544 template <>
1545 EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1546  return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
1547 }
1548 template <>
1549 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1550  return _mm_shuffle_epi32(a, 0x1B);
1551 }
1552 template <>
1553 EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1554  return _mm_shuffle_epi32(a, 0x1B);
1555 }
1556 template <>
1557 EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
1558 #ifdef EIGEN_VECTORIZE_SSSE3
1559  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1560  return _mm_shuffle_epi8(a, mask);
1561 #else
1562  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
1563  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
1564  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
1565 #endif
1566 }
1567 
1568 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
1569 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
1570 // Direct of the struct members fixed bug #62.
1571 template <>
1572 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1573  return a.m128_f32[0];
1574 }
1575 template <>
1576 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1577  return a.m128d_f64[0];
1578 }
1579 template <>
1580 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1581  int64_t x = _mm_extract_epi64_0(a);
1582  return x;
1583 }
1584 template <>
1585 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1586  int x = _mm_cvtsi128_si32(a);
1587  return x;
1588 }
1589 template <>
1590 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1591  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1592  return x;
1593 }
1594 #elif EIGEN_COMP_MSVC_STRICT
1595 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
1596 template <>
1597 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1598  float x = _mm_cvtss_f32(a);
1599  return x;
1600 }
1601 template <>
1602 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1603  double x = _mm_cvtsd_f64(a);
1604  return x;
1605 }
1606 template <>
1607 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1608  int64_t x = _mm_extract_epi64_0(a);
1609  return x;
1610 }
1611 template <>
1612 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1613  int x = _mm_cvtsi128_si32(a);
1614  return x;
1615 }
1616 template <>
1617 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1618  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1619  return x;
1620 }
1621 #else
1622 template <>
1623 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1624  return _mm_cvtss_f32(a);
1625 }
1626 template <>
1627 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1628  return _mm_cvtsd_f64(a);
1629 }
1630 template <>
1631 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1632  return _mm_extract_epi64_0(a);
1633 }
1634 template <>
1635 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1636  return _mm_cvtsi128_si32(a);
1637 }
1638 template <>
1639 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1640  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1641 }
1642 #endif
1643 template <>
1644 EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
1645  int x = _mm_cvtsi128_si32(a);
1646  return static_cast<bool>(x & 1);
1647 }
1648 
1649 template <>
1650 EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1651  return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1652 }
1653 template <>
1654 EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1655  return _mm_set_pd(from[1 * stride], from[0 * stride]);
1656 }
1657 template <>
1658 EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1659  return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
1660 }
1661 template <>
1662 EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
1663  return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1664 }
1665 template <>
1666 EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1667  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
1668  numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
1669 }
1670 
1671 template <>
1672 EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
1673  return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
1674  from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
1675  from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
1676  from[0 * stride]);
1677 }
1678 
1679 template <>
1680 EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1681  to[stride * 0] = pfirst(from);
1682  to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
1683  to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
1684  to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
1685 }
1686 template <>
1687 EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1688  to[stride * 0] = pfirst(from);
1689  to[stride * 1] = pfirst(preverse(from));
1690 }
1691 template <>
1692 EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
1693  to[stride * 0] = pfirst(from);
1694  to[stride * 1] = pfirst(preverse(from));
1695 }
1696 template <>
1697 EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
1698  to[stride * 0] = _mm_cvtsi128_si32(from);
1699  to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1700  to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1701  to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1702 }
1703 template <>
1704 EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
1705  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
1706  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
1707  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
1708  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
1709 }
1710 template <>
1711 EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
1712  EIGEN_ALIGN16 bool tmp[16];
1713  pstore(tmp, from);
1714  to[stride * 0] = tmp[0];
1715  to[stride * 1] = tmp[1];
1716  to[stride * 2] = tmp[2];
1717  to[stride * 3] = tmp[3];
1718  to[stride * 4] = tmp[4];
1719  to[stride * 5] = tmp[5];
1720  to[stride * 6] = tmp[6];
1721  to[stride * 7] = tmp[7];
1722  to[stride * 8] = tmp[8];
1723  to[stride * 9] = tmp[9];
1724  to[stride * 10] = tmp[10];
1725  to[stride * 11] = tmp[11];
1726  to[stride * 12] = tmp[12];
1727  to[stride * 13] = tmp[13];
1728  to[stride * 14] = tmp[14];
1729  to[stride * 15] = tmp[15];
1730 }
1731 
1732 // some compilers might be tempted to perform multiple moves instead of using a vector path.
1733 template <>
1734 EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
1735  Packet4f pa = _mm_set_ss(a);
1736  pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
1737 }
1738 // some compilers might be tempted to perform multiple moves instead of using a vector path.
1739 template <>
1740 EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
1741  Packet2d pa = _mm_set_sd(a);
1742  pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
1743 }
1744 
// Pointer type the address is cast to before calling _mm_prefetch.
// PGI compilers older than 19.00 get `const void*`; every other compiler gets `const char*`.
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
using SsePrefetchPtrType = const void*;
#else
using SsePrefetchPtrType = const char*;
#endif
1750 
1751 #ifndef EIGEN_VECTORIZE_AVX
1752 template <>
1753 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1754  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1755 }
1756 template <>
1757 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1758  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1759 }
1760 template <>
1761 EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1762  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1763 }
1764 template <>
1765 EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1766  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1767 }
1768 template <>
1769 EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1770  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1771 }
1772 #endif
1773 
1774 template <>
1775 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1776  return pfrexp_generic(a, exponent);
1777 }
1778 
1779 // Extract exponent without existence of Packet2l.
1780 template <>
1781 EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
1782  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1783  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
1784  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
1785 }
1786 
1787 template <>
1788 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
1789  return pfrexp_generic(a, exponent);
1790 }
1791 
1792 template <>
1793 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1794  return pldexp_generic(a, exponent);
1795 }
1796 
1797 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
1798 // supported by SSE, and has more range than is needed for exponents.
1799 template <>
1800 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1801  // Clamp exponent to [-2099, 2099]
1802  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1803  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
1804 
1805  // Convert e to integer and swizzle to low-order bits.
1806  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1807 
1808  // Split 2^e into four factors and multiply:
1809  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1810  Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
1811  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b
1812  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1813  b = psub(psub(psub(ei, b), b), b); // e - 3b
1814  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
1815  out = pmul(out, c); // a * 2^e
1816  return out;
1817 }
1818 
1819 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
1820 // supported by SSE, and has more range than is needed for exponents.
1821 template <>
1822 EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1823  // Clamp exponent to [-1023, 1024]
1824  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
1825  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
1826  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
1827 
1828  // Convert e to integer and swizzle to low-order bits.
1829  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1830 
1831  // Compute 2^e multiply:
1832  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1833  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e
1834  return pmul(a, c);
1835 }
1836 
1837 // with AVX, the default implementations based on pload1 are faster
1838 #ifndef __AVX__
1839 template <>
1840 EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1841  a3 = pload<Packet4f>(a);
1842  a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
1843  a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
1844  a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
1845  a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
1846 }
1847 template <>
1848 EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
1849  Packet2d& a3) {
1850 #ifdef EIGEN_VECTORIZE_SSE3
1851  a0 = _mm_loaddup_pd(a + 0);
1852  a1 = _mm_loaddup_pd(a + 1);
1853  a2 = _mm_loaddup_pd(a + 2);
1854  a3 = _mm_loaddup_pd(a + 3);
1855 #else
1856  a1 = pload<Packet2d>(a);
1857  a0 = vec2d_swizzle1(a1, 0, 0);
1858  a1 = vec2d_swizzle1(a1, 1, 1);
1859  a3 = pload<Packet2d>(a + 2);
1860  a2 = vec2d_swizzle1(a3, 0, 0);
1861  a3 = vec2d_swizzle1(a3, 1, 1);
1862 #endif
1863 }
1864 #endif
1865 
1866 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
1867  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1868  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1869  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1870  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1871 }
1872 
1873 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1874  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1875 }
1876 
1877 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1878  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1879  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1880  kernel.packet[1] = tmp;
1881 }
1882 
1883 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
1884  __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1885  kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1886  kernel.packet[1] = tmp;
1887 }
1888 
1889 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
1890  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1891  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1892  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1893  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1894 
1895  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1896  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1897  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1898  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1899 }
1900 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
1901  ptranspose((PacketBlock<Packet4i, 4>&)kernel);
1902 }
1903 
1904 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
1905  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1906  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1907  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1908  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1909  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1910  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1911  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1912  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1913 }
1914 
1915 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
1916  // If we number the elements in the input thus:
1917  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
1918  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
1919  // ...
1920  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
1921  //
1922  // the desired output is:
1923  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
1924  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
1925  // ...
1926  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
1927  __m128i t0 =
1928  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1929  __m128i t1 =
1930  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
1931  __m128i t2 =
1932  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
1933  __m128i t3 =
1934  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
1935  __m128i t4 =
1936  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
1937  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
1938  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1939  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1940  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1941  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1942  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1943  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1944  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1945  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1946  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1947  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1948 
1949  __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1950  __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
1951  __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
1952  __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
1953  __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1954  __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
1955  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1956  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1957  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1958  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1959  __m128i sa = _mm_unpacklo_epi16(t9, tb);
1960  __m128i sb = _mm_unpackhi_epi16(t9, tb);
1961  __m128i sc = _mm_unpacklo_epi16(tc, te);
1962  __m128i sd = _mm_unpackhi_epi16(tc, te);
1963  __m128i se = _mm_unpacklo_epi16(td, tf);
1964  __m128i sf = _mm_unpackhi_epi16(td, tf);
1965 
1966  __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1967  __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1968  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1969  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1970  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1971  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1972  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1973  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1974  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1975  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1976  __m128i ua = _mm_unpacklo_epi32(s9, sd);
1977  __m128i ub = _mm_unpackhi_epi32(s9, sd);
1978  __m128i uc = _mm_unpacklo_epi32(sa, se);
1979  __m128i ud = _mm_unpackhi_epi32(sa, se);
1980  __m128i ue = _mm_unpacklo_epi32(sb, sf);
1981  __m128i uf = _mm_unpackhi_epi32(sb, sf);
1982 
1983  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1984  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1985  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1986  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1987  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1988  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1989  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1990  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1991  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1992  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1993  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1994  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1995  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1996  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1997  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1998  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
1999 }
2000 
2001 EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
2002  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2003 }
2004 
2005 EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
2006  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2007 }
2008 
2009 template <>
2010 EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
2011  const Packet2l& elsePacket) {
2012  const __m128i true_mask = sse_blend_mask(ifPacket);
2013  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
2014 }
2015 template <>
2016 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
2017  const Packet4i& elsePacket) {
2018  const __m128i true_mask = sse_blend_mask(ifPacket);
2019  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
2020 }
2021 template <>
2022 EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
2023  const Packet4ui& elsePacket) {
2024  return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
2025 }
2026 template <>
2027 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
2028  const Packet4f& elsePacket) {
2029  const __m128i true_mask = sse_blend_mask(ifPacket);
2030  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
2031 }
2032 template <>
2033 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
2034  const Packet2d& elsePacket) {
2035  const __m128i true_mask = sse_blend_mask(ifPacket);
2036  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
2037 }
2038 
2039 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
2040 #if defined(EIGEN_VECTORIZE_FMA)
2041 template <>
2042 EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
2043  return std::fmaf(a, b, c);
2044 }
2045 template <>
2046 EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
2047  return std::fma(a, b, c);
2048 }
2049 template <>
2050 EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
2051  return std::fmaf(a, b, -c);
2052 }
2053 template <>
2054 EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
2055  return std::fma(a, b, -c);
2056 }
2057 template <>
2058 EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
2059  return std::fmaf(-a, b, c);
2060 }
2061 template <>
2062 EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
2063  return std::fma(-a, b, c);
2064 }
2065 template <>
2066 EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
2067  return std::fmaf(-a, b, -c);
2068 }
2069 template <>
2070 EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
2071  return std::fma(-a, b, -c);
2072 }
2073 #endif
2074 
2075 #ifdef EIGEN_VECTORIZE_SSE4_1
2076 // Helpers for half->float and float->half conversions.
2077 // Currently only used by the AVX code.
2078 EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
2079  __m128i input = _mm_cvtepu16_epi32(h);
2080 
2081  // Direct vectorization of half_to_float, C parts in the comments.
2082  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
2083  // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
2084  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
2085  // exp = shifted_exp & o.u; // just the exponent
2086  __m128i exp = _mm_and_si128(ou, shifted_exp);
2087  // o.u += (127 - 15) << 23;
2088  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
2089 
2090  // Inf/NaN?
2091  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
2092  // Inf/NaN adjust
2093  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
2094  // extra exp adjust for Inf/NaN
2095  ou = _mm_add_epi32(ou, naninf_adj);
2096 
2097  // Zero/Denormal?
2098  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
2099  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
2100  // o.u += 1 << 23;
2101  ou = _mm_add_epi32(ou, zeroden_adj);
2102  // magic.u = 113 << 23
2103  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
2104  // o.f -= magic.f
2105  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
2106 
2107  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
2108  // o.u |= (h.x & 0x8000) << 16; // sign bit
2109  ou = _mm_or_si128(ou, sign);
2110  // return o.f;
2111  // We are actually returning uint version, to make
2112  // _mm256_insertf128_si256 work.
2113  return ou;
2114 }
2115 
2116 EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
2117  // unsigned int sign_mask = 0x80000000u;
2118  __m128i sign = _mm_set1_epi32(0x80000000u);
2119  // unsigned int sign = f.u & sign_mask;
2120  sign = _mm_and_si128(sign, _mm_castps_si128(f));
2121  // f.u ^= sign;
2122  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
2123 
2124  __m128i fu = _mm_castps_si128(f);
2125 
2126  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
2127  __m128i f32infty = _mm_set1_epi32(255 << 23);
2128  // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2129  // there is no _mm_cmpge_epi32, so use lt and swap operands
2130  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
2131  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
2132  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
2133  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
2134  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
2135  // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2136  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
2137 
2138  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
2139  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
2140  // f.f += denorm_magic.f;
2141  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
2142  // f.u - denorm_magic.u
2143  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
2144  o = _mm_and_si128(o, subnorm_mask);
2145  // Correct result for inf/nan/zero/subnormal, 0 otherwise
2146  o = _mm_or_si128(o, naninf_value);
2147 
2148  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
2149  o = _mm_and_si128(o, mask);
2150 
2151  // mant_odd = (f.u >> 13) & 1;
2152  __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
2153  // f.u += 0xc8000fffU;
2154  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
2155  // f.u += mant_odd;
2156  fu = _mm_add_epi32(fu, mand_odd);
2157  fu = _mm_andnot_si128(mask, fu);
2158  // f.u >> 13
2159  fu = _mm_srli_epi32(fu, 13);
2160  o = _mm_or_si128(fu, o);
2161 
2162  // o.x |= static_cast<numext::uint16_t>(sign >> 16);
2163  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
2164 
2165  // 16 bit values
2166  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
2167 }
2168 #endif
2169 
2170 // Packet math for Eigen::half
2171 // Disable the following code since it's broken on too many platforms / compilers.
2172 // #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
2173 #if 0
2174 
2175 typedef struct {
2176  __m64 x;
2177 } Packet4h;
2178 
2179 
2180 template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
2181 
2182 template <>
2183 struct packet_traits<Eigen::half> : default_packet_traits {
2184  typedef Packet4h type;
2185  // There is no half-size packet for Packet4h.
2186  typedef Packet4h half;
2187  enum {
2188  Vectorizable = 1,
2189  AlignedOnScalar = 1,
2190  size = 4,
2191  HasAdd = 1,
2192  HasSub = 1,
2193  HasMul = 1,
2194  HasDiv = 1,
2195  HasNegate = 0,
2196  HasAbs = 0,
2197  HasAbs2 = 0,
2198  HasMin = 0,
2199  HasMax = 0,
2200  HasConj = 0,
2201  HasSetLinear = 0,
2202  };
2203 };
2204 
2205 
2206 template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
2207 
2208 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
2209  Packet4h result;
2210  result.x = _mm_set1_pi16(from.x);
2211  return result;
2212 }
2213 
2214 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
2215  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
2216 }
2217 
2218 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
2219 
2220 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
2221  __int64_t a64 = _mm_cvtm64_si64(a.x);
2222  __int64_t b64 = _mm_cvtm64_si64(b.x);
2223 
2224  Eigen::half h[4];
2225 
2226  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2227  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2228  h[0] = ha + hb;
2229  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2230  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2231  h[1] = ha + hb;
2232  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2233  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2234  h[2] = ha + hb;
2235  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2236  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2237  h[3] = ha + hb;
2238  Packet4h result;
2239  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2240  return result;
2241 }
2242 
2243 template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
2244  __int64_t a64 = _mm_cvtm64_si64(a.x);
2245  __int64_t b64 = _mm_cvtm64_si64(b.x);
2246 
2247  Eigen::half h[4];
2248 
2249  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2250  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2251  h[0] = ha - hb;
2252  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2253  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2254  h[1] = ha - hb;
2255  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2256  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2257  h[2] = ha - hb;
2258  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2259  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2260  h[3] = ha - hb;
2261  Packet4h result;
2262  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2263  return result;
2264 }
2265 
2266 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
2267  __int64_t a64 = _mm_cvtm64_si64(a.x);
2268  __int64_t b64 = _mm_cvtm64_si64(b.x);
2269 
2270  Eigen::half h[4];
2271 
2272  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2273  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2274  h[0] = ha * hb;
2275  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2276  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2277  h[1] = ha * hb;
2278  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2279  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2280  h[2] = ha * hb;
2281  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2282  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2283  h[3] = ha * hb;
2284  Packet4h result;
2285  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2286  return result;
2287 }
2288 
2289 template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
2290  __int64_t a64 = _mm_cvtm64_si64(a.x);
2291  __int64_t b64 = _mm_cvtm64_si64(b.x);
2292 
2293  Eigen::half h[4];
2294 
2295  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2296  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2297  h[0] = ha / hb;
2298  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2299  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2300  h[1] = ha / hb;
2301  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2302  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2303  h[2] = ha / hb;
2304  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2305  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2306  h[3] = ha / hb;
2307  Packet4h result;
2308  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2309  return result;
2310 }
2311 
2312 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
2313  Packet4h result;
2314  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2315  return result;
2316 }
2317 
2318 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
2319  Packet4h result;
2320  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2321  return result;
2322 }
2323 
2324 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2325  __int64_t r = _mm_cvtm64_si64(from.x);
2326  *(reinterpret_cast<__int64_t*>(to)) = r;
2327 }
2328 
2329 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2330  __int64_t r = _mm_cvtm64_si64(from.x);
2331  *(reinterpret_cast<__int64_t*>(to)) = r;
2332 }
2333 
2334 template<> EIGEN_STRONG_INLINE Packet4h
2335 ploadquad<Packet4h>(const Eigen::half* from) {
2336  return pset1<Packet4h>(*from);
2337 }
2338 
2339 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
2340 {
2341  Packet4h result;
2342  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2343  return result;
2344 }
2345 
2346 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
2347 {
2348  __int64_t a = _mm_cvtm64_si64(from.x);
2349  to[stride*0].x = static_cast<unsigned short>(a);
2350  to[stride*1].x = static_cast<unsigned short>(a >> 16);
2351  to[stride*2].x = static_cast<unsigned short>(a >> 32);
2352  to[stride*3].x = static_cast<unsigned short>(a >> 48);
2353 }
2354 
2355 EIGEN_STRONG_INLINE void
2356 ptranspose(PacketBlock<Packet4h,4>& kernel) {
2357  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
2358  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
2359  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
2360  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
2361 
2362  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
2363  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
2364  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
2365  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
2366 }
2367 
2368 #endif
2369 
2370 } // end namespace internal
2371 
2372 } // end namespace Eigen
2373 
2374 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
2375 // PGI++ does not define the following intrinsics in C++ mode.
2376 static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
2377 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
2378 static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
2379 static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
2380 static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
2381 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
2382 #endif
2383 
2384 #endif // EIGEN_PACKET_MATH_SSE_H
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_sign_op< typename Derived::Scalar >, const Derived > sign(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)