$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6 // Heavily based on Gael's SSE version.
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 #ifndef EIGEN_PACKET_MATH_NEON_H
13 #define EIGEN_PACKET_MATH_NEON_H
14 
15 // IWYU pragma: private
16 #include "../../InternalHeaderCheck.h"
17 
18 namespace Eigen {
19 
20 namespace internal {
21 
// Backend defaults. Each macro is only defined here when the build has not
// already provided it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// Default register count: 32 when EIGEN_ARCH_ARM64 is set, 16 otherwise.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
37 
38 #if EIGEN_COMP_MSVC_STRICT
39 
40 // In MSVC's arm_neon.h header file, all NEON vector types
41 // are aliases to the same underlying type __n128.
42 // We thus have to wrap them to make them different C++ types.
43 // (See also bug 1428)
44 typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
45 typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
46 typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
47 typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
48 typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
49 typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
50 typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
51 typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
52 typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
53 typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
54 typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
55 typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
56 typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
57 typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
58 typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
59 typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
60 typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
61 typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
62 
63 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
64  float from[4] = {a, b, c, d};
65  return vld1q_f32(from);
66 }
67 
68 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
69  float from[2] = {a, b};
70  return vld1_f32(from);
71 }
72 
73 #else
74 
75 typedef float32x2_t Packet2f;
76 typedef float32x4_t Packet4f;
77 typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
78 typedef int8x8_t Packet8c;
79 typedef int8x16_t Packet16c;
80 typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
81 typedef uint8x8_t Packet8uc;
82 typedef uint8x16_t Packet16uc;
83 typedef int16x4_t Packet4s;
84 typedef int16x8_t Packet8s;
85 typedef uint16x4_t Packet4us;
86 typedef uint16x8_t Packet8us;
87 typedef int32x2_t Packet2i;
88 typedef int32x4_t Packet4i;
89 typedef uint32x2_t Packet2ui;
90 typedef uint32x4_t Packet4ui;
91 typedef int64x2_t Packet2l;
92 typedef uint64x2_t Packet2ul;
93 
94 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
95 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
96 
97 #endif // EIGEN_COMP_MSVC_STRICT
98 
99 EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
100  const float* a = reinterpret_cast<const float*>(&m);
101  Packet4f res =
102  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
103  return res;
104 }
105 
106 // functionally equivalent to _mm_shuffle_ps in SSE when interleave
107 // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
108 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
109 // to enable a shared implementation for fast inversion of matrices of size 4.
110 template <bool interleave>
111 EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
112  const float* a = reinterpret_cast<const float*>(&m);
113  const float* b = reinterpret_cast<const float*>(&n);
114  Packet4f res =
115  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
116  return res;
117 }
118 
119 template <>
120 EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
121  const float* a = reinterpret_cast<const float*>(&m);
122  const float* b = reinterpret_cast<const float*>(&n);
123  Packet4f res =
124  make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
125  return res;
126 }
127 
128 EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
129  return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
130 }
131 
132 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
133  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
134 }
135 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
136  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
137 }
138 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
139  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
140 }
141 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
142  return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
143 }
144 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
145  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
146 }
147 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
148  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
149 }
// Broadcasts lane `p` of the Packet4f `a` to all four lanes of the result.
// `p` must be an integer constant expression in [0, 3]. vdupq_lane_f32 can
// only address the two lanes of a 64-bit vector, so the half of `a` that
// contains lane `p` is selected first and `p & 1` is the lane inside it.
// (Reading only the low half, as before, made lanes 2 and 3 unreachable.)
#define vec4f_duplane(a, p) \
  Packet4f(vdupq_lane_f32(((p) < 2) ? vget_low_f32(a) : vget_high_f32(a), (p) & 1))
151 
// Declares `const Packet4f p4f_<NAME>` with every lane set to the float X.
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares `const Packet4f p4f_<NAME>` whose lanes all carry the 32-bit
// pattern X reinterpreted as a float. The pattern is broadcast with
// vdupq_n_u32 because vreinterpretq_f32_u32 requires a uint32x4_t argument;
// the previous pset1<int32_t>(X) produced a scalar and could not compile.
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(vdupq_n_u32(X))

// Declares `const Packet4i p4i_<NAME>` with every lane set to the integer X.
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
158 
159 #if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
160 // __builtin_prefetch tends to do nothing on ARM64 compilers because the
161 // prefetch instructions there are too detailed for __builtin_prefetch to map
162 // meaningfully to them.
163 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
164 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
165 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
166 #elif defined __pld
167 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
168 #elif EIGEN_ARCH_ARM
169 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
170 #else
171 // by default no explicit prefetching
172 #define EIGEN_ARM_PREFETCH(ADDR)
173 #endif
174 
175 template <>
176 struct packet_traits<float> : default_packet_traits {
177  typedef Packet4f type;
178  typedef Packet2f half;
179  enum {
180  Vectorizable = 1,
181  AlignedOnScalar = 1,
182  size = 4,
183 
184  HasCmp = 1,
185  HasAdd = 1,
186  HasSub = 1,
187  HasShift = 1,
188  HasMul = 1,
189  HasNegate = 1,
190  HasAbs = 1,
191  HasArg = 0,
192  HasAbs2 = 1,
193  HasAbsDiff = 1,
194  HasMin = 1,
195  HasMax = 1,
196  HasConj = 1,
197  HasSetLinear = 1,
198  HasBlend = 0,
199  HasDiv = 1,
200  HasSin = EIGEN_FAST_MATH,
201  HasCos = EIGEN_FAST_MATH,
202  HasACos = 1,
203  HasASin = 1,
204  HasATan = 1,
205  HasATanh = 1,
206  HasLog = 1,
207  HasExp = 1,
208  HasPow = 1,
209  HasSqrt = 1,
210  HasRsqrt = 1,
211  HasCbrt = 1,
212  HasTanh = EIGEN_FAST_MATH,
213  HasErf = EIGEN_FAST_MATH,
214  HasErfc = EIGEN_FAST_MATH,
215  HasBessel = 0, // Issues with accuracy.
216  HasNdtri = 0
217  };
218 };
219 
220 template <>
221 struct packet_traits<int8_t> : default_packet_traits {
222  typedef Packet16c type;
223  typedef Packet8c half;
224  enum {
225  Vectorizable = 1,
226  AlignedOnScalar = 1,
227  size = 16,
228 
229  HasCmp = 1,
230  HasAdd = 1,
231  HasSub = 1,
232  HasShift = 1,
233  HasMul = 1,
234  HasNegate = 1,
235  HasAbs = 1,
236  HasAbsDiff = 1,
237  HasArg = 0,
238  HasAbs2 = 1,
239  HasMin = 1,
240  HasMax = 1,
241  HasConj = 1,
242  HasSetLinear = 1,
243  HasBlend = 0
244  };
245 };
246 
247 template <>
248 struct packet_traits<uint8_t> : default_packet_traits {
249  typedef Packet16uc type;
250  typedef Packet8uc half;
251  enum {
252  Vectorizable = 1,
253  AlignedOnScalar = 1,
254  size = 16,
255 
256  HasCmp = 1,
257  HasAdd = 1,
258  HasSub = 1,
259  HasShift = 1,
260  HasMul = 1,
261  HasNegate = 0,
262  HasAbs = 1,
263  HasAbsDiff = 1,
264  HasArg = 0,
265  HasAbs2 = 1,
266  HasMin = 1,
267  HasMax = 1,
268  HasConj = 1,
269  HasSetLinear = 1,
270  HasBlend = 0,
271 
272  HasSqrt = 1
273  };
274 };
275 
276 template <>
277 struct packet_traits<int16_t> : default_packet_traits {
278  typedef Packet8s type;
279  typedef Packet4s half;
280  enum {
281  Vectorizable = 1,
282  AlignedOnScalar = 1,
283  size = 8,
284 
285  HasCmp = 1,
286  HasAdd = 1,
287  HasSub = 1,
288  HasShift = 1,
289  HasMul = 1,
290  HasNegate = 1,
291  HasAbs = 1,
292  HasAbsDiff = 1,
293  HasArg = 0,
294  HasAbs2 = 1,
295  HasMin = 1,
296  HasMax = 1,
297  HasConj = 1,
298  HasSetLinear = 1,
299  HasBlend = 0
300  };
301 };
302 
303 template <>
304 struct packet_traits<uint16_t> : default_packet_traits {
305  typedef Packet8us type;
306  typedef Packet4us half;
307  enum {
308  Vectorizable = 1,
309  AlignedOnScalar = 1,
310  size = 8,
311 
312  HasCmp = 1,
313  HasAdd = 1,
314  HasSub = 1,
315  HasShift = 1,
316  HasMul = 1,
317  HasNegate = 0,
318  HasAbs = 1,
319  HasAbsDiff = 1,
320  HasArg = 0,
321  HasAbs2 = 1,
322  HasMin = 1,
323  HasMax = 1,
324  HasConj = 1,
325  HasSetLinear = 1,
326  HasBlend = 0,
327  HasSqrt = 1
328  };
329 };
330 
331 template <>
332 struct packet_traits<int32_t> : default_packet_traits {
333  typedef Packet4i type;
334  typedef Packet2i half;
335  enum {
336  Vectorizable = 1,
337  AlignedOnScalar = 1,
338  size = 4,
339 
340  HasCmp = 1,
341  HasAdd = 1,
342  HasSub = 1,
343  HasShift = 1,
344  HasMul = 1,
345  HasNegate = 1,
346  HasAbs = 1,
347  HasArg = 0,
348  HasAbs2 = 1,
349  HasAbsDiff = 1,
350  HasMin = 1,
351  HasMax = 1,
352  HasConj = 1,
353  HasSetLinear = 1,
354  HasBlend = 0
355  };
356 };
357 
358 template <>
359 struct packet_traits<uint32_t> : default_packet_traits {
360  typedef Packet4ui type;
361  typedef Packet2ui half;
362  enum {
363  Vectorizable = 1,
364  AlignedOnScalar = 1,
365  size = 4,
366 
367  HasCmp = 1,
368  HasAdd = 1,
369  HasSub = 1,
370  HasShift = 1,
371  HasMul = 1,
372  HasNegate = 0,
373  HasAbs = 1,
374  HasArg = 0,
375  HasAbs2 = 1,
376  HasAbsDiff = 1,
377  HasMin = 1,
378  HasMax = 1,
379  HasConj = 1,
380  HasSetLinear = 1,
381  HasBlend = 0,
382 
383  HasSqrt = 1
384  };
385 };
386 
387 template <>
388 struct packet_traits<int64_t> : default_packet_traits {
389  typedef Packet2l type;
390  typedef Packet2l half;
391  enum {
392  Vectorizable = 1,
393  AlignedOnScalar = 1,
394  size = 2,
395 
396  HasCmp = 1,
397  HasAdd = 1,
398  HasSub = 1,
399  HasShift = 1,
400  HasMul = 1,
401  HasNegate = 1,
402  HasAbs = 1,
403  HasArg = 0,
404  HasAbs2 = 1,
405  HasAbsDiff = 1,
406  HasMin = 1,
407  HasMax = 1,
408  HasConj = 1,
409  HasSetLinear = 1,
410  HasBlend = 0
411  };
412 };
413 
414 template <>
415 struct packet_traits<uint64_t> : default_packet_traits {
416  typedef Packet2ul type;
417  typedef Packet2ul half;
418  enum {
419  Vectorizable = 1,
420  AlignedOnScalar = 1,
421  size = 2,
422 
423  HasCmp = 1,
424  HasAdd = 1,
425  HasSub = 1,
426  HasShift = 1,
427  HasMul = 1,
428  HasNegate = 0,
429  HasAbs = 1,
430  HasArg = 0,
431  HasAbs2 = 1,
432  HasAbsDiff = 1,
433  HasMin = 1,
434  HasMax = 1,
435  HasConj = 1,
436  HasSetLinear = 1,
437  HasBlend = 0
438  };
439 };
440 
// Shared defaults for the unpacket_traits specializations of NEON packets.
// Specializations inherit from this struct and override `half` (and add
// `integer_packet`) where needed.
template <typename Packet, typename Scalar>
struct neon_unpacket_default {
  typedef Scalar type;  // element type stored in the packet
  typedef Packet half;  // by default a packet is its own half packet

  // Number of Scalar elements held by one Packet.
  static constexpr int size = sizeof(Packet) / sizeof(Scalar);
  // Alignment is taken to be the packet's size in bytes.
  static constexpr int alignment = sizeof(Packet);

  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
};
451 
452 template <>
453 struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
454  using integer_packet = Packet2i;
455 };
456 template <>
457 struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
458  using half = Packet2f;
459  using integer_packet = Packet4i;
460 };
461 template <>
462 struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
463 template <>
464 struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
465  using half = Packet4c;
466 };
467 template <>
468 struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
469  using half = Packet8c;
470 };
471 template <>
472 struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
473 template <>
474 struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
475  using half = Packet4uc;
476 };
477 template <>
478 struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
479  using half = Packet8uc;
480 };
481 template <>
482 struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
483 template <>
484 struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
485  using half = Packet4s;
486 };
487 template <>
488 struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
489 template <>
490 struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
491  using half = Packet4us;
492 };
493 template <>
494 struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
495 template <>
496 struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
497  using half = Packet2i;
498 };
499 template <>
500 struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
501 template <>
502 struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
503  using half = Packet2ui;
504 };
505 template <>
506 struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
507 template <>
508 struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
509 
510 template <>
511 EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
512  return vdup_n_f32(0.0f);
513 }
514 
515 template <>
516 EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
517  return vdupq_n_f32(0.0f);
518 }
519 
520 template <>
521 EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
522  return vdup_n_f32(from);
523 }
524 template <>
525 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
526  return vdupq_n_f32(from);
527 }
528 template <>
529 EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
530  return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
531 }
532 template <>
533 EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
534  return vdup_n_s8(from);
535 }
536 template <>
537 EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
538  return vdupq_n_s8(from);
539 }
540 template <>
541 EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
542  return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
543 }
544 template <>
545 EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
546  return vdup_n_u8(from);
547 }
548 template <>
549 EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
550  return vdupq_n_u8(from);
551 }
552 template <>
553 EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
554  return vdup_n_s16(from);
555 }
556 template <>
557 EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
558  return vdupq_n_s16(from);
559 }
560 template <>
561 EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
562  return vdup_n_u16(from);
563 }
564 template <>
565 EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
566  return vdupq_n_u16(from);
567 }
568 template <>
569 EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
570  return vdup_n_s32(from);
571 }
572 template <>
573 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
574  return vdupq_n_s32(from);
575 }
576 template <>
577 EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
578  return vdup_n_u32(from);
579 }
580 template <>
581 EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
582  return vdupq_n_u32(from);
583 }
584 template <>
585 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
586  return vdupq_n_s64(from);
587 }
588 template <>
589 EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
590  return vdupq_n_u64(from);
591 }
592 
593 template <>
594 EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
595  return vreinterpret_f32_u32(vdup_n_u32(from));
596 }
597 template <>
598 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
599  return vreinterpretq_f32_u32(vdupq_n_u32(from));
600 }
601 
602 template <>
603 EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
604  const float c[] = {0.0f, 1.0f};
605  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
606 }
607 template <>
608 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
609  const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
610  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
611 }
612 template <>
613 EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
614  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
615 }
616 template <>
617 EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
618  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
619  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
620 }
621 template <>
622 EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
623  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
624  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
625 }
626 template <>
627 EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
628  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
629 }
630 template <>
631 EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
632  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
633  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
634 }
635 template <>
636 EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
637  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
638  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
639 }
640 template <>
641 EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
642  const int16_t c[] = {0, 1, 2, 3};
643  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
644 }
645 template <>
646 EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
647  const uint16_t c[] = {0, 1, 2, 3};
648  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
649 }
650 template <>
651 EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
652  const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
653  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
654 }
655 template <>
656 EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
657  const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
658  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
659 }
660 template <>
661 EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
662  const int32_t c[] = {0, 1};
663  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
664 }
665 template <>
666 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
667  const int32_t c[] = {0, 1, 2, 3};
668  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
669 }
670 template <>
671 EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
672  const uint32_t c[] = {0, 1};
673  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
674 }
675 template <>
676 EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
677  const uint32_t c[] = {0, 1, 2, 3};
678  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
679 }
680 template <>
681 EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
682  const int64_t c[] = {0, 1};
683  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
684 }
685 template <>
686 EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
687  const uint64_t c[] = {0, 1};
688  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
689 }
690 
691 template <>
692 EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
693  return vadd_f32(a, b);
694 }
695 template <>
696 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
697  return vaddq_f32(a, b);
698 }
699 template <>
700 EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
701  return vget_lane_s32(
702  vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
703 }
704 template <>
705 EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
706  return vadd_s8(a, b);
707 }
708 template <>
709 EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
710  return vaddq_s8(a, b);
711 }
712 template <>
713 EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
714  return vget_lane_u32(
715  vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
716 }
717 template <>
718 EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
719  return vadd_u8(a, b);
720 }
721 template <>
722 EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
723  return vaddq_u8(a, b);
724 }
725 template <>
726 EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
727  return vadd_s16(a, b);
728 }
729 template <>
730 EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
731  return vaddq_s16(a, b);
732 }
733 template <>
734 EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
735  return vadd_u16(a, b);
736 }
737 template <>
738 EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
739  return vaddq_u16(a, b);
740 }
741 template <>
742 EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
743  return vadd_s32(a, b);
744 }
745 template <>
746 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
747  return vaddq_s32(a, b);
748 }
749 template <>
750 EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
751  return vadd_u32(a, b);
752 }
753 template <>
754 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
755  return vaddq_u32(a, b);
756 }
757 template <>
758 EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
759  return vaddq_s64(a, b);
760 }
761 template <>
762 EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
763  return vaddq_u64(a, b);
764 }
765 
766 template <>
767 EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
768  return vsub_f32(a, b);
769 }
770 template <>
771 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
772  return vsubq_f32(a, b);
773 }
774 template <>
775 EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
776  return vget_lane_s32(
777  vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
778 }
779 template <>
780 EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
781  return vsub_s8(a, b);
782 }
783 template <>
784 EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
785  return vsubq_s8(a, b);
786 }
787 template <>
788 EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
789  return vget_lane_u32(
790  vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
791 }
792 template <>
793 EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
794  return vsub_u8(a, b);
795 }
796 template <>
797 EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
798  return vsubq_u8(a, b);
799 }
800 template <>
801 EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
802  return vsub_s16(a, b);
803 }
804 template <>
805 EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
806  return vsubq_s16(a, b);
807 }
808 template <>
809 EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
810  return vsub_u16(a, b);
811 }
812 template <>
813 EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
814  return vsubq_u16(a, b);
815 }
816 template <>
817 EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
818  return vsub_s32(a, b);
819 }
820 template <>
821 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
822  return vsubq_s32(a, b);
823 }
824 template <>
825 EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
826  return vsub_u32(a, b);
827 }
828 template <>
829 EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
830  return vsubq_u32(a, b);
831 }
832 template <>
833 EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
834  return vsubq_s64(a, b);
835 }
836 template <>
837 EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
838  return vsubq_u64(a, b);
839 }
840 
841 template <>
842 EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
843 template <>
844 EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
845  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
846  return padd(a, pxor(mask, b));
847 }
848 template <>
849 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
850 template <>
851 EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
852  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
853  return padd(a, pxor(mask, b));
854 }
855 
856 template <>
857 EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
858  return vneg_f32(a);
859 }
860 template <>
861 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
862  return vnegq_f32(a);
863 }
864 template <>
865 EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
866  return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
867 }
868 template <>
869 EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
870  return vneg_s8(a);
871 }
872 template <>
873 EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
874  return vnegq_s8(a);
875 }
876 template <>
877 EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
878  return vneg_s16(a);
879 }
880 template <>
881 EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
882  return vnegq_s16(a);
883 }
884 template <>
885 EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
886  return vneg_s32(a);
887 }
888 template <>
889 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
890  return vnegq_s32(a);
891 }
892 template <>
893 EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
894 #if EIGEN_ARCH_ARM64
895  return vnegq_s64(a);
896 #else
897  return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
898 #endif
899 }
900 
901 template <>
902 EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
903  return a;
904 }
905 template <>
906 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
907  return a;
908 }
909 template <>
910 EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
911  return a;
912 }
913 template <>
914 EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
915  return a;
916 }
917 template <>
918 EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
919  return a;
920 }
921 template <>
922 EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
923  return a;
924 }
925 template <>
926 EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
927  return a;
928 }
929 template <>
930 EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
931  return a;
932 }
933 template <>
934 EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
935  return a;
936 }
937 template <>
938 EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
939  return a;
940 }
941 template <>
942 EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
943  return a;
944 }
945 template <>
946 EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
947  return a;
948 }
949 template <>
950 EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
951  return a;
952 }
953 template <>
954 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
955  return a;
956 }
957 template <>
958 EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
959  return a;
960 }
961 template <>
962 EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
963  return a;
964 }
965 template <>
966 EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
967  return a;
968 }
969 template <>
970 EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
971  return a;
972 }
973 
974 template <>
975 EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
976  return vmul_f32(a, b);
977 }
978 template <>
979 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
980  return vmulq_f32(a, b);
981 }
982 template <>
983 EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
984  return vget_lane_s32(
985  vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
986 }
987 template <>
988 EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
989  return vmul_s8(a, b);
990 }
991 template <>
992 EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
993  return vmulq_s8(a, b);
994 }
995 template <>
996 EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
997  return vget_lane_u32(
998  vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
999 }
1000 template <>
1001 EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1002  return vmul_u8(a, b);
1003 }
1004 template <>
1005 EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1006  return vmulq_u8(a, b);
1007 }
1008 template <>
1009 EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
1010  return vmul_s16(a, b);
1011 }
1012 template <>
1013 EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1014  return vmulq_s16(a, b);
1015 }
1016 template <>
1017 EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
1018  return vmul_u16(a, b);
1019 }
1020 template <>
1021 EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1022  return vmulq_u16(a, b);
1023 }
1024 template <>
1025 EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
1026  return vmul_s32(a, b);
1027 }
1028 template <>
1029 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1030  return vmulq_s32(a, b);
1031 }
1032 template <>
1033 EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1034  return vmul_u32(a, b);
1035 }
1036 template <>
1037 EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1038  return vmulq_u32(a, b);
1039 }
1040 template <>
1041 EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
1042  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
1043  vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
1044 }
1045 template <>
1046 EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1047  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
1048  vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
1049 }
1050 
1051 template <>
1052 EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
1053  eigen_assert(false && "packet integer division are not supported by NEON");
1054  return pset1<Packet4c>(0);
1055 }
1056 template <>
1057 EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
1058  eigen_assert(false && "packet integer division are not supported by NEON");
1059  return pset1<Packet8c>(0);
1060 }
1061 template <>
1062 EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
1063  eigen_assert(false && "packet integer division are not supported by NEON");
1064  return pset1<Packet16c>(0);
1065 }
1066 template <>
1067 EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
1068  eigen_assert(false && "packet integer division are not supported by NEON");
1069  return pset1<Packet4uc>(0);
1070 }
1071 template <>
1072 EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
1073  eigen_assert(false && "packet integer division are not supported by NEON");
1074  return pset1<Packet8uc>(0);
1075 }
1076 template <>
1077 EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
1078  eigen_assert(false && "packet integer division are not supported by NEON");
1079  return pset1<Packet16uc>(0);
1080 }
1081 template <>
1082 EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
1083  eigen_assert(false && "packet integer division are not supported by NEON");
1084  return pset1<Packet4s>(0);
1085 }
1086 template <>
1087 EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
1088  eigen_assert(false && "packet integer division are not supported by NEON");
1089  return pset1<Packet8s>(0);
1090 }
1091 template <>
1092 EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
1093  eigen_assert(false && "packet integer division are not supported by NEON");
1094  return pset1<Packet4us>(0);
1095 }
1096 template <>
1097 EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
1098  eigen_assert(false && "packet integer division are not supported by NEON");
1099  return pset1<Packet8us>(0);
1100 }
1101 template <>
1102 EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
1103  eigen_assert(false && "packet integer division are not supported by NEON");
1104  return pset1<Packet2i>(0);
1105 }
1106 template <>
1107 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
1108  eigen_assert(false && "packet integer division are not supported by NEON");
1109  return pset1<Packet4i>(0);
1110 }
1111 template <>
1112 EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
1113  eigen_assert(false && "packet integer division are not supported by NEON");
1114  return pset1<Packet2ui>(0);
1115 }
1116 template <>
1117 EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
1118  eigen_assert(false && "packet integer division are not supported by NEON");
1119  return pset1<Packet4ui>(0);
1120 }
1121 template <>
1122 EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
1123  eigen_assert(false && "packet integer division are not supported by NEON");
1124  return pset1<Packet2l>(0LL);
1125 }
1126 template <>
1127 EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
1128  eigen_assert(false && "packet integer division are not supported by NEON");
1129  return pset1<Packet2ul>(0ULL);
1130 }
1131 
1132 #ifdef EIGEN_VECTORIZE_FMA
1133 template <>
1134 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1135  return vfmaq_f32(c, a, b);
1136 }
1137 template <>
1138 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1139  return vfma_f32(c, a, b);
1140 }
1141 template <>
1142 EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1143  return vfmsq_f32(c, a, b);
1144 }
1145 template <>
1146 EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1147  return vfms_f32(c, a, b);
1148 }
1149 #else
1150 template <>
1151 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1152  return vmlaq_f32(c, a, b);
1153 }
1154 template <>
1155 EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1156  return vmla_f32(c, a, b);
1157 }
1158 template <>
1159 EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1160  return vmlsq_f32(c, a, b);
1161 }
1162 template <>
1163 EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1164  return vmls_f32(c, a, b);
1165 }
1166 #endif
1167 template <>
1168 EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1169  return pnegate(pnmadd(a, b, c));
1170 }
1171 template <>
1172 EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1173  return pnegate(pnmadd(a, b, c));
1174 }
1175 template <>
1176 EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1177  return pnegate(pmadd(a, b, c));
1178 }
1179 template <>
1180 EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1181  return pnegate(pmadd(a, b, c));
1182 }
1183 
1184 // No FMA instruction for int, so use MLA unconditionally.
1185 template <>
1186 EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
1187  return vget_lane_s32(
1188  vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
1189  vreinterpret_s8_s32(vdup_n_s32(b)))),
1190  0);
1191 }
1192 template <>
1193 EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
1194  return vmla_s8(c, a, b);
1195 }
1196 template <>
1197 EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
1198  return vmlaq_s8(c, a, b);
1199 }
1200 template <>
1201 EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
1202  return vget_lane_u32(
1203  vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
1204  vreinterpret_u8_u32(vdup_n_u32(b)))),
1205  0);
1206 }
1207 template <>
1208 EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
1209  return vmla_u8(c, a, b);
1210 }
1211 template <>
1212 EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
1213  return vmlaq_u8(c, a, b);
1214 }
1215 template <>
1216 EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
1217  return vmla_s16(c, a, b);
1218 }
1219 template <>
1220 EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1221  return vmlaq_s16(c, a, b);
1222 }
1223 template <>
1224 EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
1225  return vmla_u16(c, a, b);
1226 }
1227 template <>
1228 EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1229  return vmlaq_u16(c, a, b);
1230 }
1231 template <>
1232 EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
1233  return vmla_s32(c, a, b);
1234 }
1235 template <>
1236 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1237  return vmlaq_s32(c, a, b);
1238 }
1239 template <>
1240 EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
1241  return vmla_u32(c, a, b);
1242 }
1243 template <>
1244 EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
1245  return vmlaq_u32(c, a, b);
1246 }
1247 
1248 template <>
1249 EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
1250  return vabd_f32(a, b);
1251 }
1252 template <>
1253 EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
1254  return vabdq_f32(a, b);
1255 }
1256 template <>
1257 EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
1258  return vget_lane_s32(
1259  vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1260 }
1261 template <>
1262 EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
1263  return vabd_s8(a, b);
1264 }
1265 template <>
1266 EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
1267  return vabdq_s8(a, b);
1268 }
1269 template <>
1270 EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1271  return vget_lane_u32(
1272  vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1273 }
1274 template <>
1275 EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1276  return vabd_u8(a, b);
1277 }
1278 template <>
1279 EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1280  return vabdq_u8(a, b);
1281 }
1282 template <>
1283 EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
1284  return vabd_s16(a, b);
1285 }
1286 template <>
1287 EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
1288  return vabdq_s16(a, b);
1289 }
1290 template <>
1291 EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
1292  return vabd_u16(a, b);
1293 }
1294 template <>
1295 EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
1296  return vabdq_u16(a, b);
1297 }
1298 template <>
1299 EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
1300  return vabd_s32(a, b);
1301 }
1302 template <>
1303 EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
1304  return vabdq_s32(a, b);
1305 }
1306 template <>
1307 EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1308  return vabd_u32(a, b);
1309 }
1310 template <>
1311 EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1312  return vabdq_u32(a, b);
1313 }
1314 
1315 template <>
1316 EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
1317  return vmin_f32(a, b);
1318 }
1319 template <>
1320 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1321  return vminq_f32(a, b);
1322 }
1323 
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// The numeric min/max intrinsics (vminnm / vmaxnm) exist only when __ARM_FEATURE_NUMERIC_MAXMIN is
// defined, which can only be the case on Armv8 systems.
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  const Packet4f smallest = vminnmq_f32(a, b);
  return smallest;
}
template <>
EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
  const Packet2f smallest = vminnm_f32(a, b);
  return smallest;
}
#endif
1336 
1337 template <>
1338 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1339  return pmin<Packet4f>(a, b);
1340 }
1341 
1342 template <>
1343 EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1344  return pmin<Packet2f>(a, b);
1345 }
1346 
1347 template <>
1348 EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
1349  return vget_lane_s32(
1350  vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1351 }
1352 template <>
1353 EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
1354  return vmin_s8(a, b);
1355 }
1356 template <>
1357 EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1358  return vminq_s8(a, b);
1359 }
1360 template <>
1361 EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1362  return vget_lane_u32(
1363  vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1364 }
1365 template <>
1366 EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1367  return vmin_u8(a, b);
1368 }
1369 template <>
1370 EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1371  return vminq_u8(a, b);
1372 }
1373 template <>
1374 EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
1375  return vmin_s16(a, b);
1376 }
1377 template <>
1378 EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1379  return vminq_s16(a, b);
1380 }
1381 template <>
1382 EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
1383  return vmin_u16(a, b);
1384 }
1385 template <>
1386 EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1387  return vminq_u16(a, b);
1388 }
1389 template <>
1390 EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
1391  return vmin_s32(a, b);
1392 }
1393 template <>
1394 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1395  return vminq_s32(a, b);
1396 }
1397 template <>
1398 EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1399  return vmin_u32(a, b);
1400 }
1401 template <>
1402 EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1403  return vminq_u32(a, b);
1404 }
1405 template <>
1406 EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1407  return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1408  vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1409 }
1410 template <>
1411 EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1412  return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1413  vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1414 }
1415 
1416 template <>
1417 EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
1418  return vmax_f32(a, b);
1419 }
1420 template <>
1421 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1422  return vmaxq_f32(a, b);
1423 }
1424 
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// The numeric min/max intrinsics (vminnm / vmaxnm) exist only when __ARM_FEATURE_NUMERIC_MAXMIN is
// defined, which can only be the case on Armv8 systems.
template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  const Packet4f largest = vmaxnmq_f32(a, b);
  return largest;
}
template <>
EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
  const Packet2f largest = vmaxnm_f32(a, b);
  return largest;
}
#endif
1437 
1438 template <>
1439 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1440  return pmax<Packet4f>(a, b);
1441 }
1442 
1443 template <>
1444 EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1445  return pmax<Packet2f>(a, b);
1446 }
1447 
1448 template <>
1449 EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
1450  return vget_lane_s32(
1451  vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1452 }
1453 template <>
1454 EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
1455  return vmax_s8(a, b);
1456 }
1457 template <>
1458 EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1459  return vmaxq_s8(a, b);
1460 }
1461 template <>
1462 EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1463  return vget_lane_u32(
1464  vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1465 }
1466 template <>
1467 EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1468  return vmax_u8(a, b);
1469 }
1470 template <>
1471 EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1472  return vmaxq_u8(a, b);
1473 }
1474 template <>
1475 EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
1476  return vmax_s16(a, b);
1477 }
1478 template <>
1479 EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1480  return vmaxq_s16(a, b);
1481 }
1482 template <>
1483 EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
1484  return vmax_u16(a, b);
1485 }
1486 template <>
1487 EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1488  return vmaxq_u16(a, b);
1489 }
1490 template <>
1491 EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
1492  return vmax_s32(a, b);
1493 }
1494 template <>
1495 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1496  return vmaxq_s32(a, b);
1497 }
1498 template <>
1499 EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1500  return vmax_u32(a, b);
1501 }
1502 template <>
1503 EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1504  return vmaxq_u32(a, b);
1505 }
1506 template <>
1507 EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1508  return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1509  vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1510 }
1511 template <>
1512 EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1513  return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1514  vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1515 }
1516 
1517 template <>
1518 EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
1519  return vreinterpret_f32_u32(vcle_f32(a, b));
1520 }
1521 template <>
1522 EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1523  return vreinterpretq_f32_u32(vcleq_f32(a, b));
1524 }
1525 template <>
1526 EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
1527  return vget_lane_s32(
1528  vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1529 }
1530 template <>
1531 EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
1532  return vreinterpret_s8_u8(vcle_s8(a, b));
1533 }
1534 template <>
1535 EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1536  return vreinterpretq_s8_u8(vcleq_s8(a, b));
1537 }
1538 template <>
1539 EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1540  return vget_lane_u32(
1541  vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1542 }
1543 template <>
1544 EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1545  return vcle_u8(a, b);
1546 }
1547 template <>
1548 EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1549  return vcleq_u8(a, b);
1550 }
1551 template <>
1552 EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
1553  return vreinterpret_s16_u16(vcle_s16(a, b));
1554 }
1555 template <>
1556 EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1557  return vreinterpretq_s16_u16(vcleq_s16(a, b));
1558 }
1559 template <>
1560 EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
1561  return vcle_u16(a, b);
1562 }
1563 template <>
1564 EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1565  return vcleq_u16(a, b);
1566 }
1567 template <>
1568 EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
1569  return vreinterpret_s32_u32(vcle_s32(a, b));
1570 }
1571 template <>
1572 EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1573  return vreinterpretq_s32_u32(vcleq_s32(a, b));
1574 }
1575 template <>
1576 EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1577  return vcle_u32(a, b);
1578 }
1579 template <>
1580 EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1581  return vcleq_u32(a, b);
1582 }
1583 template <>
1584 EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1585 #if EIGEN_ARCH_ARM64
1586  return vreinterpretq_s64_u64(vcleq_s64(a, b));
1587 #else
1588  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1589  vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1590 #endif
1591 }
1592 template <>
1593 EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1594 #if EIGEN_ARCH_ARM64
1595  return vcleq_u64(a, b);
1596 #else
1597  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1598  vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1599 #endif
1600 }
1601 
1602 template <>
1603 EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
1604  return vreinterpret_f32_u32(vclt_f32(a, b));
1605 }
1606 template <>
1607 EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1608  return vreinterpretq_f32_u32(vcltq_f32(a, b));
1609 }
1610 template <>
1611 EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
1612  return vget_lane_s32(
1613  vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1614 }
1615 template <>
1616 EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
1617  return vreinterpret_s8_u8(vclt_s8(a, b));
1618 }
1619 template <>
1620 EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1621  return vreinterpretq_s8_u8(vcltq_s8(a, b));
1622 }
1623 template <>
1624 EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1625  return vget_lane_u32(
1626  vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1627 }
1628 template <>
1629 EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1630  return vclt_u8(a, b);
1631 }
1632 template <>
1633 EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1634  return vcltq_u8(a, b);
1635 }
1636 template <>
1637 EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
1638  return vreinterpret_s16_u16(vclt_s16(a, b));
1639 }
1640 template <>
1641 EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1642  return vreinterpretq_s16_u16(vcltq_s16(a, b));
1643 }
1644 template <>
1645 EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
1646  return vclt_u16(a, b);
1647 }
1648 template <>
1649 EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1650  return vcltq_u16(a, b);
1651 }
1652 template <>
1653 EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
1654  return vreinterpret_s32_u32(vclt_s32(a, b));
1655 }
1656 template <>
1657 EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1658  return vreinterpretq_s32_u32(vcltq_s32(a, b));
1659 }
1660 template <>
1661 EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1662  return vclt_u32(a, b);
1663 }
1664 template <>
1665 EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1666  return vcltq_u32(a, b);
1667 }
1668 template <>
1669 EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1670 #if EIGEN_ARCH_ARM64
1671  return vreinterpretq_s64_u64(vcltq_s64(a, b));
1672 #else
1673  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1674  vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1675 #endif
1676 }
1677 template <>
1678 EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1679 #if EIGEN_ARCH_ARM64
1680  return vcltq_u64(a, b);
1681 #else
1682  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1683  vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1684 #endif
1685 }
1686 
1687 template <>
1688 EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
1689  return vreinterpret_f32_u32(vceq_f32(a, b));
1690 }
1691 template <>
1692 EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1693  return vreinterpretq_f32_u32(vceqq_f32(a, b));
1694 }
1695 template <>
1696 EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
1697  return vget_lane_s32(
1698  vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1699 }
1700 template <>
1701 EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
1702  return vreinterpret_s8_u8(vceq_s8(a, b));
1703 }
1704 template <>
1705 EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1706  return vreinterpretq_s8_u8(vceqq_s8(a, b));
1707 }
1708 template <>
1709 EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1710  return vget_lane_u32(
1711  vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1712 }
1713 template <>
1714 EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1715  return vceq_u8(a, b);
1716 }
1717 template <>
1718 EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1719  return vceqq_u8(a, b);
1720 }
1721 template <>
1722 EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
1723  return vreinterpret_s16_u16(vceq_s16(a, b));
1724 }
1725 template <>
1726 EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1727  return vreinterpretq_s16_u16(vceqq_s16(a, b));
1728 }
1729 template <>
1730 EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
1731  return vceq_u16(a, b);
1732 }
1733 template <>
1734 EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1735  return vceqq_u16(a, b);
1736 }
1737 template <>
1738 EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
1739  return vreinterpret_s32_u32(vceq_s32(a, b));
1740 }
1741 template <>
1742 EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1743  return vreinterpretq_s32_u32(vceqq_s32(a, b));
1744 }
1745 template <>
1746 EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1747  return vceq_u32(a, b);
1748 }
1749 template <>
1750 EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1751  return vceqq_u32(a, b);
1752 }
1753 template <>
1754 EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1755 #if EIGEN_ARCH_ARM64
1756  return vreinterpretq_s64_u64(vceqq_s64(a, b));
1757 #else
1758  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1759  vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1760 #endif
1761 }
1762 template <>
1763 EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1764 #if EIGEN_ARCH_ARM64
1765  return vceqq_u64(a, b);
1766 #else
1767  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1768  vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1769 #endif
1770 }
1771 
1772 template <>
1773 EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
1774  return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
1775 }
1776 template <>
1777 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1778  return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
1779 }
1780 
// Bitwise logical operations are not available on float vectors, so the operands are reinterpreted as
// integer vectors with NEON intrinsics, combined, and reinterpreted back.
1782 template <>
1783 EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
1784  return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1785 }
1786 template <>
1787 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1788  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1789 }
1790 template <>
1791 EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
1792  return a & b;
1793 }
1794 template <>
1795 EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
1796  return vand_s8(a, b);
1797 }
1798 template <>
1799 EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
1800  return vandq_s8(a, b);
1801 }
1802 template <>
1803 EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1804  return a & b;
1805 }
1806 template <>
1807 EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1808  return vand_u8(a, b);
1809 }
1810 template <>
1811 EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1812  return vandq_u8(a, b);
1813 }
1814 template <>
1815 EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
1816  return vand_s16(a, b);
1817 }
1818 template <>
1819 EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
1820  return vandq_s16(a, b);
1821 }
1822 template <>
1823 EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
1824  return vand_u16(a, b);
1825 }
1826 template <>
1827 EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1828  return vandq_u16(a, b);
1829 }
1830 template <>
1831 EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
1832  return vand_s32(a, b);
1833 }
1834 template <>
1835 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1836  return vandq_s32(a, b);
1837 }
1838 template <>
1839 EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1840  return vand_u32(a, b);
1841 }
1842 template <>
1843 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1844  return vandq_u32(a, b);
1845 }
1846 template <>
1847 EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
1848  return vandq_s64(a, b);
1849 }
1850 template <>
1851 EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1852  return vandq_u64(a, b);
1853 }
1854 
1855 template <>
1856 EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
1857  return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1858 }
1859 template <>
1860 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1861  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1862 }
1863 template <>
1864 EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
1865  return a | b;
1866 }
1867 template <>
1868 EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
1869  return vorr_s8(a, b);
1870 }
1871 template <>
1872 EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
1873  return vorrq_s8(a, b);
1874 }
1875 template <>
1876 EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1877  return a | b;
1878 }
1879 template <>
1880 EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1881  return vorr_u8(a, b);
1882 }
1883 template <>
1884 EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1885  return vorrq_u8(a, b);
1886 }
1887 template <>
1888 EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
1889  return vorr_s16(a, b);
1890 }
1891 template <>
1892 EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1893  return vorrq_s16(a, b);
1894 }
1895 template <>
1896 EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
1897  return vorr_u16(a, b);
1898 }
1899 template <>
1900 EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1901  return vorrq_u16(a, b);
1902 }
1903 template <>
1904 EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
1905  return vorr_s32(a, b);
1906 }
1907 template <>
1908 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1909  return vorrq_s32(a, b);
1910 }
1911 template <>
1912 EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1913  return vorr_u32(a, b);
1914 }
1915 template <>
1916 EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1917  return vorrq_u32(a, b);
1918 }
1919 template <>
1920 EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
1921  return vorrq_s64(a, b);
1922 }
1923 template <>
1924 EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1925  return vorrq_u64(a, b);
1926 }
1927 
1928 template <>
1929 EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
1930  return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1931 }
1932 template <>
1933 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1934  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1935 }
1936 template <>
1937 EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
1938  return a ^ b;
1939 }
1940 template <>
1941 EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
1942  return veor_s8(a, b);
1943 }
1944 template <>
1945 EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
1946  return veorq_s8(a, b);
1947 }
1948 template <>
1949 EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1950  return a ^ b;
1951 }
1952 template <>
1953 EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1954  return veor_u8(a, b);
1955 }
1956 template <>
1957 EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1958  return veorq_u8(a, b);
1959 }
1960 template <>
1961 EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
1962  return veor_s16(a, b);
1963 }
1964 template <>
1965 EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
1966  return veorq_s16(a, b);
1967 }
1968 template <>
1969 EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
1970  return veor_u16(a, b);
1971 }
1972 template <>
1973 EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1974  return veorq_u16(a, b);
1975 }
1976 template <>
1977 EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
1978  return veor_s32(a, b);
1979 }
1980 template <>
1981 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1982  return veorq_s32(a, b);
1983 }
1984 template <>
1985 EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1986  return veor_u32(a, b);
1987 }
1988 template <>
1989 EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1990  return veorq_u32(a, b);
1991 }
1992 template <>
1993 EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
1994  return veorq_s64(a, b);
1995 }
1996 template <>
1997 EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1998  return veorq_u64(a, b);
1999 }
2000 
2001 template <>
2002 EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
2003  return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
2004 }
2005 template <>
2006 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
2007  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
2008 }
2009 template <>
2010 EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
2011  return a & ~b;
2012 }
2013 template <>
2014 EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
2015  return vbic_s8(a, b);
2016 }
2017 template <>
2018 EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
2019  return vbicq_s8(a, b);
2020 }
2021 template <>
2022 EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
2023  return a & ~b;
2024 }
2025 template <>
2026 EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
2027  return vbic_u8(a, b);
2028 }
2029 template <>
2030 EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2031  return vbicq_u8(a, b);
2032 }
2033 template <>
2034 EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
2035  return vbic_s16(a, b);
2036 }
2037 template <>
2038 EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
2039  return vbicq_s16(a, b);
2040 }
2041 template <>
2042 EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
2043  return vbic_u16(a, b);
2044 }
2045 template <>
2046 EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
2047  return vbicq_u16(a, b);
2048 }
2049 template <>
2050 EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
2051  return vbic_s32(a, b);
2052 }
2053 template <>
2054 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
2055  return vbicq_s32(a, b);
2056 }
2057 template <>
2058 EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
2059  return vbic_u32(a, b);
2060 }
2061 template <>
2062 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2063  return vbicq_u32(a, b);
2064 }
2065 template <>
2066 EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
2067  return vbicq_s64(a, b);
2068 }
2069 template <>
2070 EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
2071  return vbicq_u64(a, b);
2072 }
2073 
2074 template <int N>
2075 EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
2076  return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2077 }
2078 template <int N>
2079 EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
2080  return vshr_n_s8(a, N);
2081 }
2082 template <int N>
2083 EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
2084  return vshrq_n_s8(a, N);
2085 }
2086 template <int N>
2087 EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
2088  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2089 }
2090 template <int N>
2091 EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
2092  return vshr_n_u8(a, N);
2093 }
2094 template <int N>
2095 EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
2096  return vshrq_n_u8(a, N);
2097 }
2098 template <int N>
2099 EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
2100  return vshr_n_s16(a, N);
2101 }
2102 template <int N>
2103 EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
2104  return vshrq_n_s16(a, N);
2105 }
2106 template <int N>
2107 EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
2108  return vshr_n_u16(a, N);
2109 }
2110 template <int N>
2111 EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
2112  return vshrq_n_u16(a, N);
2113 }
2114 template <int N>
2115 EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
2116  return vshr_n_s32(a, N);
2117 }
2118 template <int N>
2119 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
2120  return vshrq_n_s32(a, N);
2121 }
2122 template <int N>
2123 EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
2124  return vshr_n_u32(a, N);
2125 }
2126 template <int N>
2127 EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
2128  return vshrq_n_u32(a, N);
2129 }
2130 template <int N>
2131 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
2132  return vshrq_n_s64(a, N);
2133 }
2134 template <int N>
2135 EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
2136  return vshrq_n_u64(a, N);
2137 }
2138 
2139 template <int N>
2140 EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
2141  return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
2142 }
2143 template <int N>
2144 EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
2145  return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
2146 }
2147 template <int N>
2148 EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
2149  return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
2150 }
2151 template <int N>
2152 EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
2153  return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
2154 }
2155 template <int N>
2156 EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
2157  return vshr_n_u8(a, N);
2158 }
2159 template <int N>
2160 EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
2161  return vshrq_n_u8(a, N);
2162 }
2163 template <int N>
2164 EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
2165  return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
2166 }
2167 template <int N>
2168 EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
2169  return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
2170 }
2171 template <int N>
2172 EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
2173  return vshr_n_u16(a, N);
2174 }
2175 template <int N>
2176 EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
2177  return vshrq_n_u16(a, N);
2178 }
2179 template <int N>
2180 EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
2181  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
2182 }
2183 template <int N>
2184 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
2185  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
2186 }
2187 template <int N>
2188 EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
2189  return vshr_n_u32(a, N);
2190 }
2191 template <int N>
2192 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
2193  return vshrq_n_u32(a, N);
2194 }
2195 template <int N>
2196 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
2197  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
2198 }
2199 template <int N>
2200 EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
2201  return vshrq_n_u64(a, N);
2202 }
2203 
2204 template <int N>
2205 EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
2206  return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2207 }
2208 template <int N>
2209 EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
2210  return vshl_n_s8(a, N);
2211 }
2212 template <int N>
2213 EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
2214  return vshlq_n_s8(a, N);
2215 }
2216 template <int N>
2217 EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
2218  return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2219 }
2220 template <int N>
2221 EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
2222  return vshl_n_u8(a, N);
2223 }
2224 template <int N>
2225 EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
2226  return vshlq_n_u8(a, N);
2227 }
2228 template <int N>
2229 EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
2230  return vshl_n_s16(a, N);
2231 }
2232 template <int N>
2233 EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
2234  return vshlq_n_s16(a, N);
2235 }
2236 template <int N>
2237 EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
2238  return vshl_n_u16(a, N);
2239 }
2240 template <int N>
2241 EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
2242  return vshlq_n_u16(a, N);
2243 }
2244 template <int N>
2245 EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
2246  return vshl_n_s32(a, N);
2247 }
2248 template <int N>
2249 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
2250  return vshlq_n_s32(a, N);
2251 }
2252 template <int N>
2253 EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
2254  return vshl_n_u32(a, N);
2255 }
2256 template <int N>
2257 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
2258  return vshlq_n_u32(a, N);
2259 }
2260 template <int N>
2261 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
2262  return vshlq_n_s64(a, N);
2263 }
2264 template <int N>
2265 EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
2266  return vshlq_n_u64(a, N);
2267 }
2268 
2269 template <>
2270 EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
2271  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(from));
2272 }
2273 template <>
2274 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
2275  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(from));
2276 }
2277 template <>
2278 EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
2279  Packet4c res;
2280  memcpy(&res, from, sizeof(Packet4c));
2281  return res;
2282 }
2283 template <>
2284 EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
2285  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(from));
2286 }
2287 template <>
2288 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
2289  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(from));
2290 }
2291 template <>
2292 EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
2293  Packet4uc res;
2294  memcpy(&res, from, sizeof(Packet4uc));
2295  return res;
2296 }
2297 template <>
2298 EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
2299  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(from));
2300 }
2301 template <>
2302 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
2303  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(from));
2304 }
2305 template <>
2306 EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
2307  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(from));
2308 }
2309 template <>
2310 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
2311  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(from));
2312 }
2313 template <>
2314 EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
2315  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(from));
2316 }
2317 template <>
2318 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
2319  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(from));
2320 }
2321 template <>
2322 EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
2323  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(from));
2324 }
2325 template <>
2326 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
2327  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(from));
2328 }
2329 template <>
2330 EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
2331  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(from));
2332 }
2333 template <>
2334 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
2335  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(from));
2336 }
2337 template <>
2338 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
2339  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(from));
2340 }
2341 template <>
2342 EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
2343  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(from));
2344 }
2345 
2346 template <>
2347 EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
2348  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
2349 }
2350 template <>
2351 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
2352  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
2353 }
2354 template <>
2355 EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
2356  Packet4c res;
2357  memcpy(&res, from, sizeof(Packet4c));
2358  return res;
2359 }
2360 template <>
2361 EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
2362  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
2363 }
2364 template <>
2365 EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
2366  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
2367 }
2368 template <>
2369 EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
2370  Packet4uc res;
2371  memcpy(&res, from, sizeof(Packet4uc));
2372  return res;
2373 }
2374 template <>
2375 EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
2376  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
2377 }
2378 template <>
2379 EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
2380  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
2381 }
2382 template <>
2383 EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
2384  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
2385 }
2386 template <>
2387 EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
2388  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
2389 }
2390 template <>
2391 EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
2392  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
2393 }
2394 template <>
2395 EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
2396  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
2397 }
2398 template <>
2399 EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
2400  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
2401 }
2402 template <>
2403 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
2404  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
2405 }
2406 template <>
2407 EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
2408  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
2409 }
2410 template <>
2411 EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
2412  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
2413 }
2414 template <>
2415 EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
2416  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
2417 }
2418 template <>
2419 EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
2420  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
2421 }
2422 
2423 template <>
2424 EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
2425  return vld1_dup_f32(from);
2426 }
2427 template <>
2428 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
2429  return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
2430 }
2431 template <>
2432 EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
2433  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
2434  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
2435 }
2436 template <>
2437 EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
2438  const int8x8_t a = vld1_s8(from);
2439  return vzip_s8(a, a).val[0];
2440 }
2441 template <>
2442 EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
2443  const int8x8_t a = vld1_s8(from);
2444  const int8x8x2_t b = vzip_s8(a, a);
2445  return vcombine_s8(b.val[0], b.val[1]);
2446 }
2447 template <>
2448 EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
2449  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
2450  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
2451 }
2452 template <>
2453 EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
2454  const uint8x8_t a = vld1_u8(from);
2455  return vzip_u8(a, a).val[0];
2456 }
2457 template <>
2458 EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
2459  const uint8x8_t a = vld1_u8(from);
2460  const uint8x8x2_t b = vzip_u8(a, a);
2461  return vcombine_u8(b.val[0], b.val[1]);
2462 }
2463 template <>
2464 EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
2465  return vreinterpret_s16_u32(
2466  vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
2467 }
2468 template <>
2469 EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
2470  const int16x4_t a = vld1_s16(from);
2471  const int16x4x2_t b = vzip_s16(a, a);
2472  return vcombine_s16(b.val[0], b.val[1]);
2473 }
2474 template <>
2475 EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
2476  return vreinterpret_u16_u32(
2477  vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
2478 }
2479 template <>
2480 EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
2481  const uint16x4_t a = vld1_u16(from);
2482  const uint16x4x2_t b = vzip_u16(a, a);
2483  return vcombine_u16(b.val[0], b.val[1]);
2484 }
2485 template <>
2486 EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
2487  return vld1_dup_s32(from);
2488 }
2489 template <>
2490 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
2491  return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
2492 }
2493 template <>
2494 EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
2495  return vld1_dup_u32(from);
2496 }
2497 template <>
2498 EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
2499  return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
2500 }
2501 template <>
2502 EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
2503  return vld1q_dup_s64(from);
2504 }
2505 template <>
2506 EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
2507  return vld1q_dup_u64(from);
2508 }
2509 
2510 template <>
2511 EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2512  return vld1q_dup_f32(from);
2513 }
2514 template <>
2515 EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
2516  return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
2517 }
2518 template <>
2519 EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
2520  return vreinterpret_s8_u32(
2521  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2522 }
2523 template <>
2524 EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2525  const int8x8_t a = vreinterpret_s8_u32(
2526  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2527  const int8x8_t b = vreinterpret_s8_u32(
2528  vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
2529  return vcombine_s8(a, b);
2530 }
2531 template <>
2532 EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
2533  return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
2534 }
2535 template <>
2536 EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
2537  return vreinterpret_u8_u32(
2538  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2539 }
2540 template <>
2541 EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2542  const uint8x8_t a = vreinterpret_u8_u32(
2543  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2544  const uint8x8_t b = vreinterpret_u8_u32(
2545  vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
2546  return vcombine_u8(a, b);
2547 }
2548 template <>
2549 EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2550  return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
2551 }
2552 template <>
2553 EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2554  return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
2555 }
2556 template <>
2557 EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2558  return vld1q_dup_s32(from);
2559 }
2560 template <>
2561 EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2562  return vld1q_dup_u32(from);
2563 }
2564 
2565 template <>
2566 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
2567  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(to), from);
2568 }
2569 template <>
2570 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
2571  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(to), from);
2572 }
2573 template <>
2574 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
2575  memcpy(to, &from, sizeof(from));
2576 }
2577 template <>
2578 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
2579  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(to), from);
2580 }
2581 template <>
2582 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
2583  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(to), from);
2584 }
2585 template <>
2586 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
2587  memcpy(to, &from, sizeof(from));
2588 }
2589 template <>
2590 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
2591  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(to), from);
2592 }
2593 template <>
2594 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
2595  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(to), from);
2596 }
2597 template <>
2598 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
2599  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(to), from);
2600 }
2601 template <>
2602 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
2603  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(to), from);
2604 }
2605 template <>
2606 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
2607  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(to), from);
2608 }
2609 template <>
2610 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
2611  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(to), from);
2612 }
2613 template <>
2614 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
2615  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(to), from);
2616 }
2617 template <>
2618 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
2619  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(to), from);
2620 }
2621 template <>
2622 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
2623  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(to), from);
2624 }
2625 template <>
2626 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
2627  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(to), from);
2628 }
2629 template <>
2630 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
2631  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(to), from);
2632 }
2633 template <>
2634 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
2635  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(to), from);
2636 }
2637 
2638 template <>
2639 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
2640  EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
2641 }
2642 template <>
2643 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
2644  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
2645 }
2646 template <>
2647 EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
2648  memcpy(to, &from, sizeof(from));
2649 }
2650 template <>
2651 EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
2652  EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
2653 }
2654 template <>
2655 EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
2656  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
2657 }
2658 template <>
2659 EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
2660  memcpy(to, &from, sizeof(from));
2661 }
2662 template <>
2663 EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
2664  EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
2665 }
2666 template <>
2667 EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
2668  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
2669 }
2670 template <>
2671 EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
2672  EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
2673 }
2674 template <>
2675 EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
2676  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
2677 }
2678 template <>
2679 EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
2680  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
2681 }
2682 template <>
2683 EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
2684  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
2685 }
2686 template <>
2687 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
2688  EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
2689 }
2690 template <>
2691 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
2692  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
2693 }
2694 template <>
2695 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
2696  EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
2697 }
2698 template <>
2699 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
2700  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
2701 }
2702 template <>
2703 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
2704  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
2705 }
2706 template <>
2707 EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
2708  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
2709 }
2710 
2711 template <>
2712 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
2713  Packet2f res = vld1_dup_f32(from);
2714  res = vld1_lane_f32(from + 1 * stride, res, 1);
2715  return res;
2716 }
2717 template <>
2718 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
2719  Packet4f res = vld1q_dup_f32(from);
2720  res = vld1q_lane_f32(from + 1 * stride, res, 1);
2721  res = vld1q_lane_f32(from + 2 * stride, res, 2);
2722  res = vld1q_lane_f32(from + 3 * stride, res, 3);
2723  return res;
2724 }
2725 template <>
2726 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
2727  Packet4c res;
2728  for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
2729  return res;
2730 }
2731 template <>
2732 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
2733  Packet8c res = vld1_dup_s8(from);
2734  res = vld1_lane_s8(from + 1 * stride, res, 1);
2735  res = vld1_lane_s8(from + 2 * stride, res, 2);
2736  res = vld1_lane_s8(from + 3 * stride, res, 3);
2737  res = vld1_lane_s8(from + 4 * stride, res, 4);
2738  res = vld1_lane_s8(from + 5 * stride, res, 5);
2739  res = vld1_lane_s8(from + 6 * stride, res, 6);
2740  res = vld1_lane_s8(from + 7 * stride, res, 7);
2741  return res;
2742 }
2743 template <>
2744 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
2745  Packet16c res = vld1q_dup_s8(from);
2746  res = vld1q_lane_s8(from + 1 * stride, res, 1);
2747  res = vld1q_lane_s8(from + 2 * stride, res, 2);
2748  res = vld1q_lane_s8(from + 3 * stride, res, 3);
2749  res = vld1q_lane_s8(from + 4 * stride, res, 4);
2750  res = vld1q_lane_s8(from + 5 * stride, res, 5);
2751  res = vld1q_lane_s8(from + 6 * stride, res, 6);
2752  res = vld1q_lane_s8(from + 7 * stride, res, 7);
2753  res = vld1q_lane_s8(from + 8 * stride, res, 8);
2754  res = vld1q_lane_s8(from + 9 * stride, res, 9);
2755  res = vld1q_lane_s8(from + 10 * stride, res, 10);
2756  res = vld1q_lane_s8(from + 11 * stride, res, 11);
2757  res = vld1q_lane_s8(from + 12 * stride, res, 12);
2758  res = vld1q_lane_s8(from + 13 * stride, res, 13);
2759  res = vld1q_lane_s8(from + 14 * stride, res, 14);
2760  res = vld1q_lane_s8(from + 15 * stride, res, 15);
2761  return res;
2762 }
2763 template <>
2764 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
2765  Packet4uc res;
2766  for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
2767  return res;
2768 }
2769 template <>
2770 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
2771  Packet8uc res = vld1_dup_u8(from);
2772  res = vld1_lane_u8(from + 1 * stride, res, 1);
2773  res = vld1_lane_u8(from + 2 * stride, res, 2);
2774  res = vld1_lane_u8(from + 3 * stride, res, 3);
2775  res = vld1_lane_u8(from + 4 * stride, res, 4);
2776  res = vld1_lane_u8(from + 5 * stride, res, 5);
2777  res = vld1_lane_u8(from + 6 * stride, res, 6);
2778  res = vld1_lane_u8(from + 7 * stride, res, 7);
2779  return res;
2780 }
2781 template <>
2782 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
2783  Packet16uc res = vld1q_dup_u8(from);
2784  res = vld1q_lane_u8(from + 1 * stride, res, 1);
2785  res = vld1q_lane_u8(from + 2 * stride, res, 2);
2786  res = vld1q_lane_u8(from + 3 * stride, res, 3);
2787  res = vld1q_lane_u8(from + 4 * stride, res, 4);
2788  res = vld1q_lane_u8(from + 5 * stride, res, 5);
2789  res = vld1q_lane_u8(from + 6 * stride, res, 6);
2790  res = vld1q_lane_u8(from + 7 * stride, res, 7);
2791  res = vld1q_lane_u8(from + 8 * stride, res, 8);
2792  res = vld1q_lane_u8(from + 9 * stride, res, 9);
2793  res = vld1q_lane_u8(from + 10 * stride, res, 10);
2794  res = vld1q_lane_u8(from + 11 * stride, res, 11);
2795  res = vld1q_lane_u8(from + 12 * stride, res, 12);
2796  res = vld1q_lane_u8(from + 13 * stride, res, 13);
2797  res = vld1q_lane_u8(from + 14 * stride, res, 14);
2798  res = vld1q_lane_u8(from + 15 * stride, res, 15);
2799  return res;
2800 }
2801 template <>
2802 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
2803  Packet4s res = vld1_dup_s16(from);
2804  res = vld1_lane_s16(from + 1 * stride, res, 1);
2805  res = vld1_lane_s16(from + 2 * stride, res, 2);
2806  res = vld1_lane_s16(from + 3 * stride, res, 3);
2807  return res;
2808 }
2809 template <>
2810 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
2811  Packet8s res = vld1q_dup_s16(from);
2812  res = vld1q_lane_s16(from + 1 * stride, res, 1);
2813  res = vld1q_lane_s16(from + 2 * stride, res, 2);
2814  res = vld1q_lane_s16(from + 3 * stride, res, 3);
2815  res = vld1q_lane_s16(from + 4 * stride, res, 4);
2816  res = vld1q_lane_s16(from + 5 * stride, res, 5);
2817  res = vld1q_lane_s16(from + 6 * stride, res, 6);
2818  res = vld1q_lane_s16(from + 7 * stride, res, 7);
2819  return res;
2820 }
2821 template <>
2822 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
2823  Packet4us res = vld1_dup_u16(from);
2824  res = vld1_lane_u16(from + 1 * stride, res, 1);
2825  res = vld1_lane_u16(from + 2 * stride, res, 2);
2826  res = vld1_lane_u16(from + 3 * stride, res, 3);
2827  return res;
2828 }
2829 template <>
2830 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
2831  Packet8us res = vld1q_dup_u16(from);
2832  res = vld1q_lane_u16(from + 1 * stride, res, 1);
2833  res = vld1q_lane_u16(from + 2 * stride, res, 2);
2834  res = vld1q_lane_u16(from + 3 * stride, res, 3);
2835  res = vld1q_lane_u16(from + 4 * stride, res, 4);
2836  res = vld1q_lane_u16(from + 5 * stride, res, 5);
2837  res = vld1q_lane_u16(from + 6 * stride, res, 6);
2838  res = vld1q_lane_u16(from + 7 * stride, res, 7);
2839  return res;
2840 }
2841 template <>
2842 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
2843  Packet2i res = vld1_dup_s32(from);
2844  res = vld1_lane_s32(from + 1 * stride, res, 1);
2845  return res;
2846 }
2847 template <>
2848 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
2849  Packet4i res = vld1q_dup_s32(from);
2850  res = vld1q_lane_s32(from + 1 * stride, res, 1);
2851  res = vld1q_lane_s32(from + 2 * stride, res, 2);
2852  res = vld1q_lane_s32(from + 3 * stride, res, 3);
2853  return res;
2854 }
2855 template <>
2856 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
2857  Packet2ui res = vld1_dup_u32(from);
2858  res = vld1_lane_u32(from + 1 * stride, res, 1);
2859  return res;
2860 }
2861 template <>
2862 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
2863  Packet4ui res = vld1q_dup_u32(from);
2864  res = vld1q_lane_u32(from + 1 * stride, res, 1);
2865  res = vld1q_lane_u32(from + 2 * stride, res, 2);
2866  res = vld1q_lane_u32(from + 3 * stride, res, 3);
2867  return res;
2868 }
2869 template <>
2870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
2871  Packet2l res = vld1q_dup_s64(from);
2872  res = vld1q_lane_s64(from + 1 * stride, res, 1);
2873  return res;
2874 }
2875 template <>
2876 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
2877  Packet2ul res = vld1q_dup_u64(from);
2878  res = vld1q_lane_u64(from + 1 * stride, res, 1);
2879  return res;
2880 }
2881 
2882 template <>
2883 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
2884  vst1_lane_f32(to + stride * 0, from, 0);
2885  vst1_lane_f32(to + stride * 1, from, 1);
2886 }
2887 template <>
2888 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
2889  vst1q_lane_f32(to + stride * 0, from, 0);
2890  vst1q_lane_f32(to + stride * 1, from, 1);
2891  vst1q_lane_f32(to + stride * 2, from, 2);
2892  vst1q_lane_f32(to + stride * 3, from, 3);
2893 }
2894 template <>
2895 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
2896  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
2897 }
2898 template <>
2899 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
2900  vst1_lane_s8(to + stride * 0, from, 0);
2901  vst1_lane_s8(to + stride * 1, from, 1);
2902  vst1_lane_s8(to + stride * 2, from, 2);
2903  vst1_lane_s8(to + stride * 3, from, 3);
2904  vst1_lane_s8(to + stride * 4, from, 4);
2905  vst1_lane_s8(to + stride * 5, from, 5);
2906  vst1_lane_s8(to + stride * 6, from, 6);
2907  vst1_lane_s8(to + stride * 7, from, 7);
2908 }
2909 template <>
2910 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
2911  Index stride) {
2912  vst1q_lane_s8(to + stride * 0, from, 0);
2913  vst1q_lane_s8(to + stride * 1, from, 1);
2914  vst1q_lane_s8(to + stride * 2, from, 2);
2915  vst1q_lane_s8(to + stride * 3, from, 3);
2916  vst1q_lane_s8(to + stride * 4, from, 4);
2917  vst1q_lane_s8(to + stride * 5, from, 5);
2918  vst1q_lane_s8(to + stride * 6, from, 6);
2919  vst1q_lane_s8(to + stride * 7, from, 7);
2920  vst1q_lane_s8(to + stride * 8, from, 8);
2921  vst1q_lane_s8(to + stride * 9, from, 9);
2922  vst1q_lane_s8(to + stride * 10, from, 10);
2923  vst1q_lane_s8(to + stride * 11, from, 11);
2924  vst1q_lane_s8(to + stride * 12, from, 12);
2925  vst1q_lane_s8(to + stride * 13, from, 13);
2926  vst1q_lane_s8(to + stride * 14, from, 14);
2927  vst1q_lane_s8(to + stride * 15, from, 15);
2928 }
2929 template <>
2930 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
2931  Index stride) {
2932  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
2933 }
2934 template <>
2935 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
2936  Index stride) {
2937  vst1_lane_u8(to + stride * 0, from, 0);
2938  vst1_lane_u8(to + stride * 1, from, 1);
2939  vst1_lane_u8(to + stride * 2, from, 2);
2940  vst1_lane_u8(to + stride * 3, from, 3);
2941  vst1_lane_u8(to + stride * 4, from, 4);
2942  vst1_lane_u8(to + stride * 5, from, 5);
2943  vst1_lane_u8(to + stride * 6, from, 6);
2944  vst1_lane_u8(to + stride * 7, from, 7);
2945 }
2946 template <>
2947 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
2948  Index stride) {
2949  vst1q_lane_u8(to + stride * 0, from, 0);
2950  vst1q_lane_u8(to + stride * 1, from, 1);
2951  vst1q_lane_u8(to + stride * 2, from, 2);
2952  vst1q_lane_u8(to + stride * 3, from, 3);
2953  vst1q_lane_u8(to + stride * 4, from, 4);
2954  vst1q_lane_u8(to + stride * 5, from, 5);
2955  vst1q_lane_u8(to + stride * 6, from, 6);
2956  vst1q_lane_u8(to + stride * 7, from, 7);
2957  vst1q_lane_u8(to + stride * 8, from, 8);
2958  vst1q_lane_u8(to + stride * 9, from, 9);
2959  vst1q_lane_u8(to + stride * 10, from, 10);
2960  vst1q_lane_u8(to + stride * 11, from, 11);
2961  vst1q_lane_u8(to + stride * 12, from, 12);
2962  vst1q_lane_u8(to + stride * 13, from, 13);
2963  vst1q_lane_u8(to + stride * 14, from, 14);
2964  vst1q_lane_u8(to + stride * 15, from, 15);
2965 }
2966 template <>
2967 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
2968  Index stride) {
2969  vst1_lane_s16(to + stride * 0, from, 0);
2970  vst1_lane_s16(to + stride * 1, from, 1);
2971  vst1_lane_s16(to + stride * 2, from, 2);
2972  vst1_lane_s16(to + stride * 3, from, 3);
2973 }
2974 template <>
2975 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
2976  Index stride) {
2977  vst1q_lane_s16(to + stride * 0, from, 0);
2978  vst1q_lane_s16(to + stride * 1, from, 1);
2979  vst1q_lane_s16(to + stride * 2, from, 2);
2980  vst1q_lane_s16(to + stride * 3, from, 3);
2981  vst1q_lane_s16(to + stride * 4, from, 4);
2982  vst1q_lane_s16(to + stride * 5, from, 5);
2983  vst1q_lane_s16(to + stride * 6, from, 6);
2984  vst1q_lane_s16(to + stride * 7, from, 7);
2985 }
2986 template <>
2987 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
2988  Index stride) {
2989  vst1_lane_u16(to + stride * 0, from, 0);
2990  vst1_lane_u16(to + stride * 1, from, 1);
2991  vst1_lane_u16(to + stride * 2, from, 2);
2992  vst1_lane_u16(to + stride * 3, from, 3);
2993 }
2994 template <>
2995 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
2996  Index stride) {
2997  vst1q_lane_u16(to + stride * 0, from, 0);
2998  vst1q_lane_u16(to + stride * 1, from, 1);
2999  vst1q_lane_u16(to + stride * 2, from, 2);
3000  vst1q_lane_u16(to + stride * 3, from, 3);
3001  vst1q_lane_u16(to + stride * 4, from, 4);
3002  vst1q_lane_u16(to + stride * 5, from, 5);
3003  vst1q_lane_u16(to + stride * 6, from, 6);
3004  vst1q_lane_u16(to + stride * 7, from, 7);
3005 }
3006 template <>
3007 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
3008  Index stride) {
3009  vst1_lane_s32(to + stride * 0, from, 0);
3010  vst1_lane_s32(to + stride * 1, from, 1);
3011 }
3012 template <>
3013 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
3014  Index stride) {
3015  vst1q_lane_s32(to + stride * 0, from, 0);
3016  vst1q_lane_s32(to + stride * 1, from, 1);
3017  vst1q_lane_s32(to + stride * 2, from, 2);
3018  vst1q_lane_s32(to + stride * 3, from, 3);
3019 }
3020 template <>
3021 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
3022  Index stride) {
3023  vst1_lane_u32(to + stride * 0, from, 0);
3024  vst1_lane_u32(to + stride * 1, from, 1);
3025 }
3026 template <>
3027 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
3028  Index stride) {
3029  vst1q_lane_u32(to + stride * 0, from, 0);
3030  vst1q_lane_u32(to + stride * 1, from, 1);
3031  vst1q_lane_u32(to + stride * 2, from, 2);
3032  vst1q_lane_u32(to + stride * 3, from, 3);
3033 }
3034 template <>
3035 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
3036  Index stride) {
3037  vst1q_lane_s64(to + stride * 0, from, 0);
3038  vst1q_lane_s64(to + stride * 1, from, 1);
3039 }
3040 template <>
3041 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
3042  Index stride) {
3043  vst1q_lane_u64(to + stride * 0, from, 0);
3044  vst1q_lane_u64(to + stride * 1, from, 1);
3045 }
3046 
3047 template <>
3048 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
3049  EIGEN_ARM_PREFETCH(addr);
3050 }
3051 template <>
3052 EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
3053  EIGEN_ARM_PREFETCH(addr);
3054 }
3055 template <>
3056 EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
3057  EIGEN_ARM_PREFETCH(addr);
3058 }
3059 template <>
3060 EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
3061  EIGEN_ARM_PREFETCH(addr);
3062 }
3063 template <>
3064 EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
3065  EIGEN_ARM_PREFETCH(addr);
3066 }
3067 template <>
3068 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
3069  EIGEN_ARM_PREFETCH(addr);
3070 }
3071 template <>
3072 EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
3073  EIGEN_ARM_PREFETCH(addr);
3074 }
3075 template <>
3076 EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
3077  EIGEN_ARM_PREFETCH(addr);
3078 }
3079 template <>
3080 EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
3081  EIGEN_ARM_PREFETCH(addr);
3082 }
3083 
3084 template <>
3085 EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
3086  return vget_lane_f32(a, 0);
3087 }
3088 template <>
3089 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
3090  return vgetq_lane_f32(a, 0);
3091 }
3092 template <>
3093 EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
3094  return static_cast<int8_t>(a & 0xff);
3095 }
3096 template <>
3097 EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
3098  return vget_lane_s8(a, 0);
3099 }
3100 template <>
3101 EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
3102  return vgetq_lane_s8(a, 0);
3103 }
3104 template <>
3105 EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
3106  return static_cast<uint8_t>(a & 0xff);
3107 }
3108 template <>
3109 EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
3110  return vget_lane_u8(a, 0);
3111 }
3112 template <>
3113 EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
3114  return vgetq_lane_u8(a, 0);
3115 }
3116 template <>
3117 EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
3118  return vget_lane_s16(a, 0);
3119 }
3120 template <>
3121 EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
3122  return vgetq_lane_s16(a, 0);
3123 }
3124 template <>
3125 EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
3126  return vget_lane_u16(a, 0);
3127 }
3128 template <>
3129 EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
3130  return vgetq_lane_u16(a, 0);
3131 }
3132 template <>
3133 EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
3134  return vget_lane_s32(a, 0);
3135 }
3136 template <>
3137 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
3138  return vgetq_lane_s32(a, 0);
3139 }
3140 template <>
3141 EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
3142  return vget_lane_u32(a, 0);
3143 }
3144 template <>
3145 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
3146  return vgetq_lane_u32(a, 0);
3147 }
3148 template <>
3149 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
3150  return vgetq_lane_s64(a, 0);
3151 }
3152 template <>
3153 EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
3154  return vgetq_lane_u64(a, 0);
3155 }
3156 
3157 template <>
3158 EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
3159  return vrev64_f32(a);
3160 }
3161 template <>
3162 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
3163  const float32x4_t a_r64 = vrev64q_f32(a);
3164  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
3165 }
3166 template <>
3167 EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
3168  return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3169 }
3170 template <>
3171 EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
3172  return vrev64_s8(a);
3173 }
3174 template <>
3175 EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
3176  const int8x16_t a_r64 = vrev64q_s8(a);
3177  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
3178 }
3179 template <>
3180 EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
3181  return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
3182 }
3183 template <>
3184 EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
3185  return vrev64_u8(a);
3186 }
3187 template <>
3188 EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
3189  const uint8x16_t a_r64 = vrev64q_u8(a);
3190  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
3191 }
3192 template <>
3193 EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
3194  return vrev64_s16(a);
3195 }
3196 template <>
3197 EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
3198  const int16x8_t a_r64 = vrev64q_s16(a);
3199  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
3200 }
3201 template <>
3202 EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
3203  return vrev64_u16(a);
3204 }
3205 template <>
3206 EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
3207  const uint16x8_t a_r64 = vrev64q_u16(a);
3208  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
3209 }
3210 template <>
3211 EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
3212  return vrev64_s32(a);
3213 }
3214 template <>
3215 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
3216  const int32x4_t a_r64 = vrev64q_s32(a);
3217  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
3218 }
3219 template <>
3220 EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
3221  return vrev64_u32(a);
3222 }
3223 template <>
3224 EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
3225  const uint32x4_t a_r64 = vrev64q_u32(a);
3226  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
3227 }
3228 template <>
3229 EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
3230  return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
3231 }
3232 template <>
3233 EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
3234  return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
3235 }
3236 
3237 template <>
3238 EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
3239  return vabs_f32(a);
3240 }
3241 template <>
3242 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
3243  return vabsq_f32(a);
3244 }
3245 template <>
3246 EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
3247  return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3248 }
3249 template <>
3250 EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
3251  return vabs_s8(a);
3252 }
3253 template <>
3254 EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
3255  return vabsq_s8(a);
3256 }
3257 template <>
3258 EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
3259  return a;
3260 }
3261 template <>
3262 EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
3263  return a;
3264 }
3265 template <>
3266 EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
3267  return a;
3268 }
3269 template <>
3270 EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
3271  return vabs_s16(a);
3272 }
3273 template <>
3274 EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
3275  return vabsq_s16(a);
3276 }
3277 template <>
3278 EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
3279  return a;
3280 }
3281 template <>
3282 EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
3283  return a;
3284 }
3285 template <>
3286 EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
3287  return vabs_s32(a);
3288 }
3289 template <>
3290 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
3291  return vabsq_s32(a);
3292 }
3293 template <>
3294 EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
3295  return a;
3296 }
3297 template <>
3298 EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
3299  return a;
3300 }
3301 template <>
3302 EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
3303 #if EIGEN_ARCH_ARM64
3304  return vabsq_s64(a);
3305 #else
3306  return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
3307 #endif
3308 }
3309 template <>
3310 EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
3311  return a;
3312 }
3313 
3314 template <>
3315 EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
3316  return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
3317 }
3318 template <>
3319 EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
3320  return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
3321 }
3322 
3323 template <>
3324 EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
3325  return pfrexp_generic(a, exponent);
3326 }
3327 template <>
3328 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
3329  return pfrexp_generic(a, exponent);
3330 }
3331 
3332 template <>
3333 EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
3334  return pldexp_generic(a, exponent);
3335 }
3336 template <>
3337 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
3338  return pldexp_generic(a, exponent);
3339 }
3340 
3341 #if EIGEN_ARCH_ARM64
3342 template <>
3343 EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3344  return vaddv_f32(a);
3345 }
3346 template <>
3347 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3348  return vaddvq_f32(a);
3349 }
3350 #else
3351 template <>
3352 EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3353  return vget_lane_f32(vpadd_f32(a, a), 0);
3354 }
3355 template <>
3356 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3357  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
3358  return vget_lane_f32(vpadd_f32(sum, sum), 0);
3359 }
3360 #endif
3361 template <>
3362 EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
3363  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3364  int8x8_t sum = vpadd_s8(a_dup, a_dup);
3365  sum = vpadd_s8(sum, sum);
3366  return vget_lane_s8(sum, 0);
3367 }
3368 #if EIGEN_ARCH_ARM64
3369 template <>
3370 EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3371  return vaddv_s8(a);
3372 }
3373 template <>
3374 EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3375  return vaddvq_s8(a);
3376 }
3377 #else
3378 template <>
3379 EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3380  int8x8_t sum = vpadd_s8(a, a);
3381  sum = vpadd_s8(sum, sum);
3382  sum = vpadd_s8(sum, sum);
3383  return vget_lane_s8(sum, 0);
3384 }
3385 template <>
3386 EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3387  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
3388  sum = vpadd_s8(sum, sum);
3389  sum = vpadd_s8(sum, sum);
3390  sum = vpadd_s8(sum, sum);
3391  return vget_lane_s8(sum, 0);
3392 }
3393 #endif
3394 template <>
3395 EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
3396  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3397  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
3398  sum = vpadd_u8(sum, sum);
3399  return vget_lane_u8(sum, 0);
3400 }
3401 #if EIGEN_ARCH_ARM64
3402 template <>
3403 EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3404  return vaddv_u8(a);
3405 }
3406 template <>
3407 EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3408  return vaddvq_u8(a);
3409 }
3410 template <>
3411 EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3412  return vaddv_s16(a);
3413 }
3414 template <>
3415 EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3416  return vaddvq_s16(a);
3417 }
3418 template <>
3419 EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3420  return vaddv_u16(a);
3421 }
3422 template <>
3423 EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3424  return vaddvq_u16(a);
3425 }
3426 template <>
3427 EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3428  return vaddv_s32(a);
3429 }
3430 template <>
3431 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3432  return vaddvq_s32(a);
3433 }
3434 template <>
3435 EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3436  return vaddv_u32(a);
3437 }
3438 template <>
3439 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3440  return vaddvq_u32(a);
3441 }
3442 template <>
3443 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3444  return vaddvq_s64(a);
3445 }
3446 template <>
3447 EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3448  return vaddvq_u64(a);
3449 }
3450 #else
3451 template <>
3452 EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3453  uint8x8_t sum = vpadd_u8(a, a);
3454  sum = vpadd_u8(sum, sum);
3455  sum = vpadd_u8(sum, sum);
3456  return vget_lane_u8(sum, 0);
3457 }
3458 template <>
3459 EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3460  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
3461  sum = vpadd_u8(sum, sum);
3462  sum = vpadd_u8(sum, sum);
3463  sum = vpadd_u8(sum, sum);
3464  return vget_lane_u8(sum, 0);
3465 }
3466 template <>
3467 EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3468  const int16x4_t sum = vpadd_s16(a, a);
3469  return vget_lane_s16(vpadd_s16(sum, sum), 0);
3470 }
3471 template <>
3472 EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3473  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
3474  sum = vpadd_s16(sum, sum);
3475  sum = vpadd_s16(sum, sum);
3476  return vget_lane_s16(sum, 0);
3477 }
3478 template <>
3479 EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3480  const uint16x4_t sum = vpadd_u16(a, a);
3481  return vget_lane_u16(vpadd_u16(sum, sum), 0);
3482 }
3483 template <>
3484 EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3485  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
3486  sum = vpadd_u16(sum, sum);
3487  sum = vpadd_u16(sum, sum);
3488  return vget_lane_u16(sum, 0);
3489 }
3490 template <>
3491 EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3492  return vget_lane_s32(vpadd_s32(a, a), 0);
3493 }
3494 template <>
3495 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3496  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
3497  return vget_lane_s32(vpadd_s32(sum, sum), 0);
3498 }
3499 template <>
3500 EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3501  return vget_lane_u32(vpadd_u32(a, a), 0);
3502 }
3503 template <>
3504 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3505  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
3506  return vget_lane_u32(vpadd_u32(sum, sum), 0);
3507 }
3508 template <>
3509 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3510  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
3511 }
3512 template <>
3513 EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3514  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
3515 }
3516 #endif
3517 
3518 template <>
3519 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
3520  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
3521 }
3522 template <>
3523 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
3524  return vadd_s8(vget_high_s8(a), vget_low_s8(a));
3525 }
3526 template <>
3527 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
3528  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
3529 }
3530 template <>
3531 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
3532  return vadd_u8(vget_high_u8(a), vget_low_u8(a));
3533 }
3534 template <>
3535 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
3536  return vadd_s16(vget_high_s16(a), vget_low_s16(a));
3537 }
3538 template <>
3539 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
3540  return vadd_u16(vget_high_u16(a), vget_low_u16(a));
3541 }
3542 
3543 // Other reduction functions:
3544 // mul
3545 template <>
3546 EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
3547  return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
3548 }
3549 template <>
3550 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
3551  return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
3552 }
3553 template <>
3554 EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
3555  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
3556  prod = vmul_s8(prod, vrev16_s8(prod));
3557  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
3558 }
3559 template <>
3560 EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
3561  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
3562  prod = vmul_s8(prod, vrev32_s8(prod));
3563  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
3564 }
3565 template <>
3566 EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
3567  return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
3568 }
3569 template <>
3570 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
3571  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
3572  prod = vmul_u8(prod, vrev16_u8(prod));
3573  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
3574 }
3575 template <>
3576 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
3577  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
3578  prod = vmul_u8(prod, vrev32_u8(prod));
3579  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
3580 }
3581 template <>
3582 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
3583  return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
3584 }
3585 template <>
3586 EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
3587  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
3588  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3589 }
3590 template <>
3591 EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
3592  int16x4_t prod;
3593 
3594  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3595  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
3596  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3597  prod = vmul_s16(prod, vrev32_s16(prod));
3598  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3599  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3600 }
3601 template <>
3602 EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
3603  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
3604  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3605 }
3606 template <>
3607 EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
3608  uint16x4_t prod;
3609 
3610  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3611  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
3612  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3613  prod = vmul_u16(prod, vrev32_u16(prod));
3614  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3615  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3616 }
3617 template <>
3618 EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
3619  return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
3620 }
3621 template <>
3622 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
3623  return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
3624 }
3625 template <>
3626 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
3627  return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
3628 }
3629 template <>
3630 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
3631  return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
3632 }
3633 template <>
3634 EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
3635  return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
3636 }
3637 template <>
3638 EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
3639  return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
3640 }
3641 
3642 // min
3643 #if EIGEN_ARCH_ARM64
3644 template <>
3645 EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3646  return vminv_f32(a);
3647 }
3648 template <>
3649 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3650  return vminvq_f32(a);
3651 }
3652 #else
3653 template <>
3654 EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3655  return vget_lane_f32(vpmin_f32(a, a), 0);
3656 }
3657 template <>
3658 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3659  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
3660  return vget_lane_f32(vpmin_f32(min, min), 0);
3661 }
3662 #endif
3663 template <>
3664 EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
3665  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3666  int8x8_t min = vpmin_s8(a_dup, a_dup);
3667  min = vpmin_s8(min, min);
3668  return vget_lane_s8(min, 0);
3669 }
3670 #if EIGEN_ARCH_ARM64
3671 template <>
3672 EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3673  return vminv_s8(a);
3674 }
3675 template <>
3676 EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3677  return vminvq_s8(a);
3678 }
3679 #else
3680 template <>
3681 EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3682  int8x8_t min = vpmin_s8(a, a);
3683  min = vpmin_s8(min, min);
3684  min = vpmin_s8(min, min);
3685  return vget_lane_s8(min, 0);
3686 }
3687 template <>
3688 EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3689  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
3690  min = vpmin_s8(min, min);
3691  min = vpmin_s8(min, min);
3692  min = vpmin_s8(min, min);
3693  return vget_lane_s8(min, 0);
3694 }
3695 #endif
3696 template <>
3697 EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
3698  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3699  uint8x8_t min = vpmin_u8(a_dup, a_dup);
3700  min = vpmin_u8(min, min);
3701  return vget_lane_u8(min, 0);
3702 }
3703 #if EIGEN_ARCH_ARM64
3704 template <>
3705 EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3706  return vminv_u8(a);
3707 }
3708 template <>
3709 EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3710  return vminvq_u8(a);
3711 }
3712 template <>
3713 EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3714  return vminv_s16(a);
3715 }
3716 template <>
3717 EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3718  return vminvq_s16(a);
3719 }
3720 template <>
3721 EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3722  return vminv_u16(a);
3723 }
3724 template <>
3725 EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3726  return vminvq_u16(a);
3727 }
3728 template <>
3729 EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3730  return vminv_s32(a);
3731 }
3732 template <>
3733 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3734  return vminvq_s32(a);
3735 }
3736 template <>
3737 EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3738  return vminv_u32(a);
3739 }
3740 template <>
3741 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3742  return vminvq_u32(a);
3743 }
3744 #else
3745 template <>
3746 EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3747  uint8x8_t min = vpmin_u8(a, a);
3748  min = vpmin_u8(min, min);
3749  min = vpmin_u8(min, min);
3750  return vget_lane_u8(min, 0);
3751 }
3752 template <>
3753 EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3754  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
3755  min = vpmin_u8(min, min);
3756  min = vpmin_u8(min, min);
3757  min = vpmin_u8(min, min);
3758  return vget_lane_u8(min, 0);
3759 }
3760 template <>
3761 EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3762  const int16x4_t min = vpmin_s16(a, a);
3763  return vget_lane_s16(vpmin_s16(min, min), 0);
3764 }
3765 template <>
3766 EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3767  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
3768  min = vpmin_s16(min, min);
3769  min = vpmin_s16(min, min);
3770  return vget_lane_s16(min, 0);
3771 }
3772 template <>
3773 EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3774  const uint16x4_t min = vpmin_u16(a, a);
3775  return vget_lane_u16(vpmin_u16(min, min), 0);
3776 }
3777 template <>
3778 EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3779  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
3780  min = vpmin_u16(min, min);
3781  min = vpmin_u16(min, min);
3782  return vget_lane_u16(min, 0);
3783 }
3784 template <>
3785 EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3786  return vget_lane_s32(vpmin_s32(a, a), 0);
3787 }
3788 template <>
3789 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3790  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
3791  return vget_lane_s32(vpmin_s32(min, min), 0);
3792 }
3793 template <>
3794 EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3795  return vget_lane_u32(vpmin_u32(a, a), 0);
3796 }
3797 template <>
3798 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3799  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
3800  return vget_lane_u32(vpmin_u32(min, min), 0);
3801 }
3802 #endif
3803 template <>
3804 EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
3805  return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3806 }
3807 template <>
3808 EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
3809  return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3810 }
3811 
3812 // max
3813 #if EIGEN_ARCH_ARM64
3814 template <>
3815 EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3816  return vmaxv_f32(a);
3817 }
3818 template <>
3819 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3820  return vmaxvq_f32(a);
3821 }
3822 #else
3823 template <>
3824 EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3825  return vget_lane_f32(vpmax_f32(a, a), 0);
3826 }
3827 template <>
3828 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3829  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
3830  return vget_lane_f32(vpmax_f32(max, max), 0);
3831 }
3832 #endif
3833 template <>
3834 EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
3835  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3836  int8x8_t max = vpmax_s8(a_dup, a_dup);
3837  max = vpmax_s8(max, max);
3838  return vget_lane_s8(max, 0);
3839 }
3840 #if EIGEN_ARCH_ARM64
3841 template <>
3842 EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3843  return vmaxv_s8(a);
3844 }
3845 template <>
3846 EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3847  return vmaxvq_s8(a);
3848 }
3849 #else
3850 template <>
3851 EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3852  int8x8_t max = vpmax_s8(a, a);
3853  max = vpmax_s8(max, max);
3854  max = vpmax_s8(max, max);
3855  return vget_lane_s8(max, 0);
3856 }
3857 template <>
3858 EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3859  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
3860  max = vpmax_s8(max, max);
3861  max = vpmax_s8(max, max);
3862  max = vpmax_s8(max, max);
3863  return vget_lane_s8(max, 0);
3864 }
3865 #endif
3866 template <>
3867 EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
3868  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3869  uint8x8_t max = vpmax_u8(a_dup, a_dup);
3870  max = vpmax_u8(max, max);
3871  return vget_lane_u8(max, 0);
3872 }
3873 #if EIGEN_ARCH_ARM64
3874 template <>
3875 EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3876  return vmaxv_u8(a);
3877 }
3878 template <>
3879 EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3880  return vmaxvq_u8(a);
3881 }
3882 template <>
3883 EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3884  return vmaxv_s16(a);
3885 }
3886 template <>
3887 EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3888  return vmaxvq_s16(a);
3889 }
3890 template <>
3891 EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3892  return vmaxv_u16(a);
3893 }
3894 template <>
3895 EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3896  return vmaxvq_u16(a);
3897 }
3898 template <>
3899 EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3900  return vmaxv_s32(a);
3901 }
3902 template <>
3903 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3904  return vmaxvq_s32(a);
3905 }
3906 template <>
3907 EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3908  return vmaxv_u32(a);
3909 }
3910 template <>
3911 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
3912  return vmaxvq_u32(a);
3913 }
3914 #else
3915 template <>
3916 EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3917  uint8x8_t max = vpmax_u8(a, a);
3918  max = vpmax_u8(max, max);
3919  max = vpmax_u8(max, max);
3920  return vget_lane_u8(max, 0);
3921 }
3922 template <>
3923 EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3924  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
3925  max = vpmax_u8(max, max);
3926  max = vpmax_u8(max, max);
3927  max = vpmax_u8(max, max);
3928  return vget_lane_u8(max, 0);
3929 }
3930 template <>
3931 EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3932  const int16x4_t max = vpmax_s16(a, a);
3933  return vget_lane_s16(vpmax_s16(max, max), 0);
3934 }
3935 template <>
3936 EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3937  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
3938  max = vpmax_s16(max, max);
3939  max = vpmax_s16(max, max);
3940  return vget_lane_s16(max, 0);
3941 }
3942 template <>
3943 EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3944  const uint16x4_t max = vpmax_u16(a, a);
3945  return vget_lane_u16(vpmax_u16(max, max), 0);
3946 }
3947 template <>
3948 EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3949  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
3950  max = vpmax_u16(max, max);
3951  max = vpmax_u16(max, max);
3952  return vget_lane_u16(max, 0);
3953 }
3954 template <>
3955 EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3956  return vget_lane_s32(vpmax_s32(a, a), 0);
3957 }
3958 template <>
3959 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3960  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
3961  return vget_lane_s32(vpmax_s32(max, max), 0);
3962 }
3963 template <>
3964 EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3965  return vget_lane_u32(vpmax_u32(a, a), 0);
3966 }
3967 template <>
3968 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
3969  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
3970  return vget_lane_u32(vpmax_u32(max, max), 0);
3971 }
3972 #endif
3973 template <>
3974 EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
3975  return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3976 }
3977 template <>
3978 EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
3979  return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3980 }
3981 
3982 template <>
3983 EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
3984  uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
3985  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
3986 }
3987 
3988 // Helpers for ptranspose.
3989 namespace detail {
3990 
3991 template <typename Packet>
3992 void zip_in_place(Packet& p1, Packet& p2);
3993 
3994 template <>
3995 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
3996  const float32x2x2_t tmp = vzip_f32(p1, p2);
3997  p1 = tmp.val[0];
3998  p2 = tmp.val[1];
3999 }
4000 
4001 template <>
4002 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
4003  const float32x4x2_t tmp = vzipq_f32(p1, p2);
4004  p1 = tmp.val[0];
4005  p2 = tmp.val[1];
4006 }
4007 
4008 template <>
4009 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
4010  const int8x8x2_t tmp = vzip_s8(p1, p2);
4011  p1 = tmp.val[0];
4012  p2 = tmp.val[1];
4013 }
4014 
4015 template <>
4016 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
4017  const int8x16x2_t tmp = vzipq_s8(p1, p2);
4018  p1 = tmp.val[0];
4019  p2 = tmp.val[1];
4020 }
4021 
4022 template <>
4023 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
4024  const uint8x8x2_t tmp = vzip_u8(p1, p2);
4025  p1 = tmp.val[0];
4026  p2 = tmp.val[1];
4027 }
4028 
4029 template <>
4030 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
4031  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
4032  p1 = tmp.val[0];
4033  p2 = tmp.val[1];
4034 }
4035 
4036 template <>
4037 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
4038  const int32x2x2_t tmp = vzip_s32(p1, p2);
4039  p1 = tmp.val[0];
4040  p2 = tmp.val[1];
4041 }
4042 
4043 template <>
4044 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
4045  const int32x4x2_t tmp = vzipq_s32(p1, p2);
4046  p1 = tmp.val[0];
4047  p2 = tmp.val[1];
4048 }
4049 
4050 template <>
4051 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
4052  const uint32x2x2_t tmp = vzip_u32(p1, p2);
4053  p1 = tmp.val[0];
4054  p2 = tmp.val[1];
4055 }
4056 
4057 template <>
4058 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
4059  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
4060  p1 = tmp.val[0];
4061  p2 = tmp.val[1];
4062 }
4063 
4064 template <>
4065 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
4066  const int16x4x2_t tmp = vzip_s16(p1, p2);
4067  p1 = tmp.val[0];
4068  p2 = tmp.val[1];
4069 }
4070 
4071 template <>
4072 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
4073  const int16x8x2_t tmp = vzipq_s16(p1, p2);
4074  p1 = tmp.val[0];
4075  p2 = tmp.val[1];
4076 }
4077 
4078 template <>
4079 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
4080  const uint16x4x2_t tmp = vzip_u16(p1, p2);
4081  p1 = tmp.val[0];
4082  p2 = tmp.val[1];
4083 }
4084 
4085 template <>
4086 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
4087  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
4088  p1 = tmp.val[0];
4089  p2 = tmp.val[1];
4090 }
4091 
4092 template <typename Packet>
4093 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
4094  zip_in_place(kernel.packet[0], kernel.packet[1]);
4095 }
4096 
4097 template <typename Packet>
4098 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
4099  zip_in_place(kernel.packet[0], kernel.packet[2]);
4100  zip_in_place(kernel.packet[1], kernel.packet[3]);
4101  zip_in_place(kernel.packet[0], kernel.packet[1]);
4102  zip_in_place(kernel.packet[2], kernel.packet[3]);
4103 }
4104 
4105 template <typename Packet>
4106 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
4107  zip_in_place(kernel.packet[0], kernel.packet[4]);
4108  zip_in_place(kernel.packet[1], kernel.packet[5]);
4109  zip_in_place(kernel.packet[2], kernel.packet[6]);
4110  zip_in_place(kernel.packet[3], kernel.packet[7]);
4111 
4112  zip_in_place(kernel.packet[0], kernel.packet[2]);
4113  zip_in_place(kernel.packet[1], kernel.packet[3]);
4114  zip_in_place(kernel.packet[4], kernel.packet[6]);
4115  zip_in_place(kernel.packet[5], kernel.packet[7]);
4116 
4117  zip_in_place(kernel.packet[0], kernel.packet[1]);
4118  zip_in_place(kernel.packet[2], kernel.packet[3]);
4119  zip_in_place(kernel.packet[4], kernel.packet[5]);
4120  zip_in_place(kernel.packet[6], kernel.packet[7]);
4121 }
4122 
4123 template <typename Packet>
4124 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
4125  EIGEN_UNROLL_LOOP
4126  for (int i = 0; i < 4; ++i) {
4127  const int m = (1 << i);
4128  EIGEN_UNROLL_LOOP
4129  for (int j = 0; j < m; ++j) {
4130  const int n = (1 << (3 - i));
4131  EIGEN_UNROLL_LOOP
4132  for (int k = 0; k < n; ++k) {
4133  const int idx = 2 * j * n + k;
4134  zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
4135  }
4136  }
4137  }
4138 }
4139 
4140 } // namespace detail
4141 
4142 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
4143  detail::ptranspose_impl(kernel);
4144 }
4145 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
4146  detail::ptranspose_impl(kernel);
4147 }
4148 
4149 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
4150  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
4151  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
4152 
4153  const int8x8x2_t zip8 = vzip_s8(a, b);
4154  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
4155 
4156  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
4157  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
4158  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
4159  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
4160 }
4161 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
4162  detail::ptranspose_impl(kernel);
4163 }
4164 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
4165  detail::ptranspose_impl(kernel);
4166 }
4167 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
4168  detail::ptranspose_impl(kernel);
4169 }
4170 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
4171  detail::ptranspose_impl(kernel);
4172 }
4173 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
4174  detail::ptranspose_impl(kernel);
4175 }
4176 
4177 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
4178  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
4179  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
4180 
4181  const uint8x8x2_t zip8 = vzip_u8(a, b);
4182  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
4183 
4184  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
4185  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
4186  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
4187  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
4188 }
4189 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
4190  detail::ptranspose_impl(kernel);
4191 }
4192 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
4193  detail::ptranspose_impl(kernel);
4194 }
4195 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
4196  detail::ptranspose_impl(kernel);
4197 }
4198 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
4199  detail::ptranspose_impl(kernel);
4200 }
4201 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
4202  detail::ptranspose_impl(kernel);
4203 }
4204 
4205 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
4206  detail::ptranspose_impl(kernel);
4207 }
4208 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
4209  detail::ptranspose_impl(kernel);
4210 }
4211 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
4212  detail::ptranspose_impl(kernel);
4213 }
4214 
4215 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
4216  detail::ptranspose_impl(kernel);
4217 }
4218 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
4219  detail::ptranspose_impl(kernel);
4220 }
4221 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
4222  detail::ptranspose_impl(kernel);
4223 }
4224 
4225 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
4226  detail::ptranspose_impl(kernel);
4227 }
4228 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
4229  detail::ptranspose_impl(kernel);
4230 }
4231 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
4232  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
4233 }
4234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
4235  detail::ptranspose_impl(kernel);
4236 }
4237 
4238 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
4239 #if EIGEN_ARCH_ARM64
4240  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
4241  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
4242  kernel.packet[0] = tmp1;
4243 #else
4244  const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
4245  {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
4246 
4247  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
4248  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
4249 #endif
4250 }
4251 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
4252 #if EIGEN_ARCH_ARM64
4253  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
4254  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
4255  kernel.packet[0] = tmp1;
4256 #else
4257  const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
4258  {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
4259 
4260  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
4261  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
4262 #endif
4263 }
4264 
4265 template <>
4266 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
4267  return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
4268 }
4269 template <>
4270 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
4271  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
4272 }
4273 template <>
4274 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
4275  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
4276 }
4277 template <>
4278 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
4279  return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
4280 }
4281 template <>
4282 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
4283  return vbsl_u8(mask, a, b);
4284 }
4285 template <>
4286 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
4287  const Packet16uc& b) {
4288  return vbslq_u8(mask, a, b);
4289 }
4290 template <>
4291 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
4292  return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
4293 }
4294 template <>
4295 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
4296  return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
4297 }
4298 template <>
4299 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
4300  return vbsl_u16(mask, a, b);
4301 }
4302 template <>
4303 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
4304  return vbslq_u16(mask, a, b);
4305 }
4306 template <>
4307 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
4308  return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
4309 }
4310 template <>
4311 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
4312  return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
4313 }
4314 template <>
4315 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
4316  return vbsl_u32(mask, a, b);
4317 }
4318 template <>
4319 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
4320  return vbslq_u32(mask, a, b);
4321 }
4322 template <>
4323 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
4324  return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
4325 }
4326 template <>
4327 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
4328  return vbslq_u64(mask, a, b);
4329 }
4330 
// Use ARMv8 rounding intrinsics if available.
#if EIGEN_ARCH_ARMV8
// print: round to nearest, ties to even (vrndn).
template <>
EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
  return vrndn_f32(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  return vrndnq_f32(a);
}

// pfloor: round toward minus infinity (vrndm).
template <>
EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
  return vrndm_f32(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  return vrndmq_f32(a);
}

// pceil: round toward plus infinity (vrndp).
template <>
EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
  return vrndp_f32(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  return vrndpq_f32(a);
}

// pround: round to nearest, ties away from zero (vrnda).
template <>
EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
  return vrnda_f32(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  return vrndaq_f32(a);
}

// ptrunc: round toward zero (vrnd).
template <>
EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
  return vrnd_f32(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  return vrndq_f32(a);
}
#endif
4383 
4390 template <>
4391 EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
4392  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
4393  uint8x8_t res = vdup_n_u8(0);
4394  uint8x8_t add = vdup_n_u8(0x8);
4395  for (int i = 0; i < 4; i++) {
4396  const uint8x8_t temp = vorr_u8(res, add);
4397  res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
4398  add = vshr_n_u8(add, 1);
4399  }
4400  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
4401 }
4403 template <>
4404 EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
4405  uint8x8_t res = vdup_n_u8(0);
4406  uint8x8_t add = vdup_n_u8(0x8);
4407  for (int i = 0; i < 4; i++) {
4408  const uint8x8_t temp = vorr_u8(res, add);
4409  res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
4410  add = vshr_n_u8(add, 1);
4411  }
4412  return res;
4413 }
4415 template <>
4416 EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
4417  uint8x16_t res = vdupq_n_u8(0);
4418  uint8x16_t add = vdupq_n_u8(0x8);
4419  for (int i = 0; i < 4; i++) {
4420  const uint8x16_t temp = vorrq_u8(res, add);
4421  res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
4422  add = vshrq_n_u8(add, 1);
4423  }
4424  return res;
4425 }
4427 template <>
4428 EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
4429  uint16x4_t res = vdup_n_u16(0);
4430  uint16x4_t add = vdup_n_u16(0x80);
4431  for (int i = 0; i < 8; i++) {
4432  const uint16x4_t temp = vorr_u16(res, add);
4433  res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
4434  add = vshr_n_u16(add, 1);
4435  }
4436  return res;
4437 }
4439 template <>
4440 EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
4441  uint16x8_t res = vdupq_n_u16(0);
4442  uint16x8_t add = vdupq_n_u16(0x80);
4443  for (int i = 0; i < 8; i++) {
4444  const uint16x8_t temp = vorrq_u16(res, add);
4445  res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
4446  add = vshrq_n_u16(add, 1);
4447  }
4448  return res;
4449 }
4451 template <>
4452 EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
4453  uint32x2_t res = vdup_n_u32(0);
4454  uint32x2_t add = vdup_n_u32(0x8000);
4455  for (int i = 0; i < 16; i++) {
4456  const uint32x2_t temp = vorr_u32(res, add);
4457  res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
4458  add = vshr_n_u32(add, 1);
4459  }
4460  return res;
4461 }
4463 template <>
4464 EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
4465  uint32x4_t res = vdupq_n_u32(0);
4466  uint32x4_t add = vdupq_n_u32(0x8000);
4467  for (int i = 0; i < 16; i++) {
4468  const uint32x4_t temp = vorrq_u32(res, add);
4469  res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
4470  add = vshrq_n_u32(add, 1);
4471  }
4472  return res;
4473 }
4474 
4475 EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
4476  // Compute approximate reciprocal sqrt.
4477  // Does not correctly handle +/- 0 or +inf
4478  float32x4_t result = vrsqrteq_f32(a);
4479  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4480  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4481  return result;
4482 }
4483 
4484 EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
4485  // Compute approximate reciprocal sqrt.
4486  // Does not correctly handle +/- 0 or +inf
4487  float32x2_t result = vrsqrte_f32(a);
4488  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4489  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4490  return result;
4491 }
4492 
4493 template <typename Packet>
4494 Packet prsqrt_float_common(const Packet& a) {
4495  const Packet cst_zero = pzero(a);
4496  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4497  Packet return_zero = pcmp_eq(a, cst_inf);
4498  Packet return_inf = pcmp_eq(a, cst_zero);
4499  Packet result = prsqrt_float_unsafe(a);
4500  result = pselect(return_inf, por(cst_inf, a), result);
4501  result = pandnot(result, return_zero);
4502  return result;
4503 }
4504 
4505 template <>
4506 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
4507  return prsqrt_float_common(a);
4508 }
4509 
4510 template <>
4511 EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
4512  return prsqrt_float_common(a);
4513 }
4514 
4515 template <>
4516 EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
4517  // Compute approximate reciprocal.
4518  float32x4_t result = vrecpeq_f32(a);
4519  result = vmulq_f32(vrecpsq_f32(a, result), result);
4520  result = vmulq_f32(vrecpsq_f32(a, result), result);
4521  return result;
4522 }
4523 
4524 template <>
4525 EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
4526  // Compute approximate reciprocal.
4527  float32x2_t result = vrecpe_f32(a);
4528  result = vmul_f32(vrecps_f32(a, result), result);
4529  result = vmul_f32(vrecps_f32(a, result), result);
4530  return result;
4531 }
4532 
4533 // Unfortunately vsqrt_f32 is only available for A64.
4534 #if EIGEN_ARCH_ARM64
4535 template <>
4536 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4537  return vsqrtq_f32(a);
4538 }
4539 
4540 template <>
4541 EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4542  return vsqrt_f32(a);
4543 }
4544 
4545 template <>
4546 EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
4547  return vdivq_f32(a, b);
4548 }
4549 
4550 template <>
4551 EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
4552  return vdiv_f32(a, b);
4553 }
4554 #else
4555 template <typename Packet>
4556 EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
4557  const Packet cst_zero = pzero(a);
4558  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4559 
4560  Packet result = pmul(a, prsqrt_float_unsafe(a));
4561  Packet a_is_zero = pcmp_eq(a, cst_zero);
4562  Packet a_is_inf = pcmp_eq(a, cst_inf);
4563  Packet return_a = por(a_is_zero, a_is_inf);
4564 
4565  result = pselect(return_a, a, result);
4566  return result;
4567 }
4568 
4569 template <>
4570 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4571  return psqrt_float_common(a);
4572 }
4573 
4574 template <>
4575 EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4576  return psqrt_float_common(a);
4577 }
4578 
4579 template <typename Packet>
4580 EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
4581  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
4582  // avoid underflow with the following manipulation:
4583  // a / b = f * (a * reciprocal(f * b))
4584 
4585  const Packet cst_one = pset1<Packet>(1.0f);
4586  const Packet cst_quarter = pset1<Packet>(0.25f);
4587  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
4588 
4589  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
4590  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
4591  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
4592  return result;
4593 }
4594 
4595 template <>
4596 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
4597  return pdiv_float_common(a, b);
4598 }
4599 
4600 template <>
4601 EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
4602  return pdiv_float_common(a, b);
4603 }
4604 #endif
4605 
4606 //---------- bfloat16 ----------
4607 // TODO: Add support for native armv8.6-a bfloat16_t
4608 
4609 // TODO: Guard if we have native bfloat16 support
4610 typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
4611 
4612 template <>
4613 struct is_arithmetic<Packet4bf> {
4614  enum { value = true };
4615 };
4616 
4617 template <>
4618 struct packet_traits<bfloat16> : default_packet_traits {
4619  typedef Packet4bf type;
4620  typedef Packet4bf half;
4621  enum {
4622  Vectorizable = 1,
4623  AlignedOnScalar = 1,
4624  size = 4,
4625 
4626  HasCmp = 1,
4627  HasAdd = 1,
4628  HasSub = 1,
4629  HasShift = 1,
4630  HasMul = 1,
4631  HasNegate = 1,
4632  HasAbs = 1,
4633  HasArg = 0,
4634  HasAbs2 = 1,
4635  HasAbsDiff = 1,
4636  HasMin = 1,
4637  HasMax = 1,
4638  HasConj = 1,
4639  HasSetLinear = 1,
4640  HasBlend = 0,
4641  HasDiv = 1,
4642  HasSin = EIGEN_FAST_MATH,
4643  HasCos = EIGEN_FAST_MATH,
4644  HasLog = 1,
4645  HasExp = 1,
4646  HasSqrt = 0,
4647  HasTanh = EIGEN_FAST_MATH,
4648  HasErf = EIGEN_FAST_MATH,
4649  HasBessel = 0, // Issues with accuracy.
4650  HasNdtri = 0
4651  };
4652 };
4653 
4654 template <>
4655 struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
4656 
4657 namespace detail {
4658 template <>
4659 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
4660  const uint16x4x2_t tmp = vzip_u16(p1, p2);
4661  p1 = tmp.val[0];
4662  p2 = tmp.val[1];
4663 }
4664 } // namespace detail
4665 
4666 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
4667  // See the scalar implementation in BFloat16.h for a comprehensible explanation
4668  // of this fast rounding algorithm
4669  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
4670 
4671  // lsb = (input >> 16) & 1
4672  Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
4673 
4674  // rounding_bias = 0x7fff + lsb
4675  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
4676 
4677  // input += rounding_bias
4678  input = vaddq_u32(input, rounding_bias);
4679 
4680  // input = input >> 16
4681  input = vshrq_n_u32(input, 16);
4682 
4683  // Replace float-nans by bfloat16-nans, that is 0x7fc0
4684  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
4685  const Packet4ui mask = vceqq_f32(p, p);
4686  input = vbslq_u32(mask, input, bf16_nan);
4687 
4688  // output = static_cast<uint16_t>(input)
4689  return vmovn_u32(input);
4690 }
4691 
4692 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
4693  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
4694 }
4695 
4696 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
4697 
4698 template <>
4699 EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
4700  return Packet4bf(pset1<Packet4us>(from.value));
4701 }
4702 
4703 template <>
4704 EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
4705  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
4706 }
4707 
4708 template <>
4709 EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
4710  return Packet4bf(
4711  pload<Packet4us>(reinterpret_cast<const uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(from))));
4712 }
4713 
4714 template <>
4715 EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
4716  return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4717 }
4718 
4719 template <>
4720 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
4721  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
4722  reinterpret_cast<uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(to)), from);
4723 }
4724 
4725 template <>
4726 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
4727  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4728 }
4729 
4730 template <>
4731 EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
4732  return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4733 }
4734 
4735 template <>
4736 EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
4737  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
4738 }
4739 
4740 template <>
4741 EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4742  return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4743 }
4744 template <>
4745 EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4746  return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4747 }
4748 
4749 template <>
4750 EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4751  return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4752 }
4753 
4754 template <>
4755 EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4756  return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4757 }
4758 template <>
4759 EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4760  return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4761 }
4762 
4763 template <>
4764 EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4765  return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4766 }
4767 
4768 template <>
4769 EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
4770  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
4771 }
4772 
4773 template <>
4774 EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
4775  return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
4776 }
4777 
4778 template <>
4779 EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
4780  return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
4781 }
4782 
4783 template <>
4784 EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
4785  return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
4786 }
4787 
4788 template <>
4789 EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
4790  return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
4791 }
4792 
4793 template <>
4794 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
4795  return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
4796 }
4797 
4798 template <>
4799 EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
4800  return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
4801 }
4802 
4803 template <>
4804 EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
4805  return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
4806 }
4807 
4808 template <>
4809 EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
4810  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
4811 }
4812 
4813 template <>
4814 EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(const Packet4bf& a) {
4815  return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));
4816 }
4817 
4818 template <>
4819 EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(const Packet4bf& a) {
4820  return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));
4821 }
4822 
4823 template <>
4824 EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
4825  return a;
4826 }
4827 
4828 template <>
4829 EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4830  return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4831 }
4832 
4833 template <>
4834 EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4835  return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4836 }
4837 
4838 template <>
4839 EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4840  return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4841 }
4842 
4843 template <>
4844 EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4845  return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4846 }
4847 
4848 template <>
4849 EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4850  return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4851 }
4852 
4853 template <>
4854 EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4855  return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4856 }
4857 
4858 template <>
4859 EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4860  return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4861 }
4862 
4863 template <>
4864 EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4865  return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4866 }
4867 
4868 template <>
4869 EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
4870  return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
4871 }
4872 
4873 template <>
4874 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
4875  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
4876 }
4877 
4878 template <>
4879 EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
4880  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
4881 }
4882 
4883 template <>
4884 EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
4885  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
4886 }
4887 
4888 template <>
4889 EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
4890  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
4891 }
4892 
4893 template <>
4894 EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
4895  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
4896 }
4897 
4898 template <>
4899 EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
4900  return Packet4bf(preverse<Packet4us>(Packet4us(a)));
4901 }
4902 
4903 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
4904  detail::ptranspose_impl(kernel);
4905 }
4906 
4907 template <>
4908 EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4909  return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4910 }
4911 
4912 template <>
4913 EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4914  return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4915 }
4916 
4917 template <>
4918 EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4919  return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4920 }
4921 
4922 template <>
4923 EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4924  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4925 }
4926 
4927 template <>
4928 EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4929  return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4930 }
4931 
4932 template <>
4933 EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
4934  return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
4935 }
4936 
4937 //---------- double ----------
4938 
// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
// EIGEN_APPLE_DOUBLE_NEON_BUG is non-zero only for Apple clang builds older than 6010000.
#if !EIGEN_COMP_CLANGAPPLE
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#else
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
#endif
4949 
4950 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
4951 
#if EIGEN_COMP_GNUC
// Bug 907: workaround missing declarations of the following two functions in the ADK.
// Defining these functions as templates ensures that if these intrinsics are
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
// This doesn't work with MSVC though, since the function names are macros.

// Reinterprets the argument as two unsigned 64-bit lanes.
template <typename T>
uint64x2_t vreinterpretq_u64_f64(T a) {
  const uint64x2_t as_u64 = (uint64x2_t)a;
  return as_u64;
}

// Reinterprets the argument as two double lanes.
template <typename T>
float64x2_t vreinterpretq_f64_u64(T a) {
  const float64x2_t as_f64 = (float64x2_t)a;
  return as_f64;
}
#endif
4968 
4969 #if EIGEN_COMP_MSVC_STRICT
4970 typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
4971 typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
4972 
4973 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
4974  double from[2] = {a, b};
4975  return vld1q_f64(from);
4976 }
4977 
4978 #else
4979 typedef float64x2_t Packet2d;
4980 typedef float64x1_t Packet1d;
4981 
4982 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
4983 #endif
4984 
4985 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
4986 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
4987 // for fast inversion of matrices of size 4.
4988 EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
4989  const double* a = reinterpret_cast<const double*>(&m);
4990  const double* b = reinterpret_cast<const double*>(&n);
4991  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
4992  return res;
4993 }
4994 
4995 EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
4996  return shuffle(a, b, mask);
4997 }
4998 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
4999 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
5000 #define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
5001 
5002 template <>
5003 struct packet_traits<double> : default_packet_traits {
5004  typedef Packet2d type;
5005  typedef Packet2d half;
5006  enum {
5007  Vectorizable = 1,
5008  AlignedOnScalar = 1,
5009  size = 2,
5010 
5011  HasCmp = 1,
5012  HasAdd = 1,
5013  HasSub = 1,
5014  HasShift = 1,
5015  HasMul = 1,
5016  HasNegate = 1,
5017  HasAbs = 1,
5018  HasArg = 0,
5019  HasAbs2 = 1,
5020  HasAbsDiff = 1,
5021  HasMin = 1,
5022  HasMax = 1,
5023  HasConj = 1,
5024  HasSetLinear = 1,
5025  HasBlend = 0,
5026 
5027  HasDiv = 1,
5028 
5029 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5030  HasExp = 1,
5031  HasLog = 1,
5032  HasPow = 1,
5033  HasATan = 1,
5034  HasATanh = 1,
5035 #endif
5036  HasSin = EIGEN_FAST_MATH,
5037  HasCos = EIGEN_FAST_MATH,
5038  HasSqrt = 1,
5039  HasRsqrt = 1,
5040  HasCbrt = 1,
5041  HasTanh = EIGEN_FAST_MATH,
5042  HasErf = EIGEN_FAST_MATH,
5043  HasErfc = EIGEN_FAST_MATH
5044  };
5045 };
5046 
5047 template <>
5048 struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
5049  using integer_packet = Packet2l;
5050 };
5051 
5052 template <>
5053 EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
5054  return vdupq_n_f64(0.0);
5055 }
5056 
5057 template <>
5058 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
5059  return vdupq_n_f64(from);
5060 }
5061 
5062 template <>
5063 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
5064  const double c[] = {0.0, 1.0};
5065  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
5066 }
5067 
5068 template <>
5069 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
5070  return vaddq_f64(a, b);
5071 }
5072 
5073 template <>
5074 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5075  return vsubq_f64(a, b);
5076 }
5077 
5078 template <>
5079 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
5080 template <>
5081 EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5082  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
5083  return padd(a, pxor(mask, b));
5084 }
5085 
5086 template <>
5087 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
5088  return vnegq_f64(a);
5089 }
5090 
5091 template <>
5092 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
5093  return a;
5094 }
5095 
5096 template <>
5097 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
5098  return vmulq_f64(a, b);
5099 }
5100 
5101 template <>
5102 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
5103  return vdivq_f64(a, b);
5104 }
5105 
5106 #ifdef EIGEN_VECTORIZE_FMA
5107 // See bug 936. See above comment about FMA for float.
5108 template <>
5109 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5110  return vfmaq_f64(c, a, b);
5111 }
5112 template <>
5113 EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5114  return vfmsq_f64(c, a, b);
5115 }
5116 #else
5117 template <>
5118 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5119  return vmlaq_f64(c, a, b);
5120 }
5121 template <>
5122 EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5123  return vmlsq_f64(c, a, b);
5124 }
5125 #endif
5126 template <>
5127 EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5128  return pnegate(pnmadd(a, b, c));
5129 }
5130 template <>
5131 EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5132  return pnegate(pmadd(a, b, c));
5133 }
5134 template <>
5135 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
5136  return vminq_f64(a, b);
5137 }
5138 
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined
// (which can only be the case for Armv8 systems).
template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vminnmq_f64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vmaxnmq_f64(a, b);
}

#endif
5152 
5153 template <>
5154 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5155  return pmin<Packet2d>(a, b);
5156 }
5157 
5158 template <>
5159 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
5160  return vmaxq_f64(a, b);
5161 }
5162 
5163 template <>
5164 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5165  return pmax<Packet2d>(a, b);
5166 }
5167 
5168 // Logical operations are not supported on floating-point vectors, so we reinterpret them as integer vectors using NEON intrinsics
5169 template <>
5170 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
5171  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5172 }
5173 
5174 template <>
5175 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
5176  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5177 }
5178 
5179 template <>
5180 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
5181  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5182 }
5183 
5184 template <>
5185 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
5186  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5187 }
5188 
5189 template <>
5190 EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
5191  return vreinterpretq_f64_u64(vcleq_f64(a, b));
5192 }
5193 
5194 template <>
5195 EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
5196  return vreinterpretq_f64_u64(vcltq_f64(a, b));
5197 }
5198 
5199 template <>
5200 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
5201  return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
5202 }
5203 
5204 template <>
5205 EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
5206  return vreinterpretq_f64_u64(vceqq_f64(a, b));
5207 }
5208 
5209 template <>
5210 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
5211  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(from));
5212 }
5213 
5214 template <>
5215 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
5216  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
5217 }
5218 
5219 template <>
5220 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
5221  return vld1q_dup_f64(from);
5222 }
5223 template <>
5224 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
5225  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(to), from);
5226 }
5227 
5228 template <>
5229 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
5230  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
5231 }
5232 
5233 template <>
5234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
5235  Packet2d res = pset1<Packet2d>(0.0);
5236  res = vld1q_lane_f64(from + 0 * stride, res, 0);
5237  res = vld1q_lane_f64(from + 1 * stride, res, 1);
5238  return res;
5239 }
5240 
5241 template <>
5242 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
5243  vst1q_lane_f64(to + stride * 0, from, 0);
5244  vst1q_lane_f64(to + stride * 1, from, 1);
5245 }
5246 
5247 template <>
5248 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
5249  EIGEN_ARM_PREFETCH(addr);
5250 }
5251 
5252 // FIXME only store the 2 first elements ?
5253 template <>
5254 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
5255  return vgetq_lane_f64(a, 0);
5256 }
5257 
5258 template <>
5259 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
5260  return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
5261 }
5262 
5263 template <>
5264 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
5265  return vabsq_f64(a);
5266 }
5267 
5268 template <>
5269 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
5270  return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
5271 }
5272 
5273 template <>
5274 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
5275  return vaddvq_f64(a);
5276 }
5277 
5278 // Other reduction functions:
5279 // mul
5280 #if EIGEN_COMP_CLANGAPPLE
5281 template <>
5282 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5283  return (vget_low_f64(a) * vget_high_f64(a))[0];
5284 }
5285 #else
5286 template <>
5287 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5288  return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
5289 }
5290 #endif
5291 
5292 // min
5293 template <>
5294 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
5295  return vminvq_f64(a);
5296 }
5297 
5298 // max
5299 template <>
5300 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
5301  return vmaxvq_f64(a);
5302 }
5303 
5304 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
5305  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
5306  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
5307 
5308  kernel.packet[0] = tmp1;
5309  kernel.packet[1] = tmp2;
5310 }
5311 
5312 template <>
5313 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
5314  return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
5315 }
5316 
5317 template <>
5318 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
5319  return vrndnq_f64(a);
5320 }
5321 
5322 template <>
5323 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
5324  return vrndmq_f64(a);
5325 }
5326 
5327 template <>
5328 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
5329  return vrndpq_f64(a);
5330 }
5331 
5332 template <>
5333 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
5334  return vrndaq_f64(a);
5335 }
5336 
5337 template <>
5338 EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
5339  return vrndq_f64(a);
5340 }
5341 
5342 template <>
5343 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
5344  return pldexp_generic(a, exponent);
5345 }
5346 
5347 template <>
5348 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
5349  return pfrexp_generic(a, exponent);
5350 }
5351 
5352 template <>
5353 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
5354  return vreinterpretq_f64_u64(vdupq_n_u64(from));
5355 }
5356 
5357 template <>
5358 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
5359  // Do Newton iterations for 1/sqrt(x).
5360  return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
5361 }
5362 
5363 template <>
5364 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
5365  return vsqrtq_f64(_x);
5366 }
5367 
5368 #endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5369 
5370 // Do we have fp16 types and the supporting NEON intrinsics?
5371 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
5372 typedef float16x4_t Packet4hf;
5373 typedef float16x8_t Packet8hf;
5374 
5375 template <>
5376 struct packet_traits<Eigen::half> : default_packet_traits {
5377  typedef Packet8hf type;
5378  typedef Packet4hf half;
5379  enum {
5380  Vectorizable = 1,
5381  AlignedOnScalar = 1,
5382  size = 8,
5383 
5384  HasCmp = 1,
5385  HasCast = 1,
5386  HasAdd = 1,
5387  HasSub = 1,
5388  HasShift = 1,
5389  HasMul = 1,
5390  HasNegate = 1,
5391  HasAbs = 1,
5392  HasArg = 0,
5393  HasAbs2 = 1,
5394  HasAbsDiff = 0,
5395  HasMin = 1,
5396  HasMax = 1,
5397  HasConj = 1,
5398  HasSetLinear = 1,
5399  HasBlend = 0,
5400  HasInsert = 1,
5401  HasReduxp = 1,
5402  HasDiv = 1,
5403  HasSin = 0,
5404  HasCos = 0,
5405  HasLog = 0,
5406  HasExp = 0,
5407  HasTanh = packet_traits<float>::HasTanh, // tanh<half> calls tanh<float>
5408  HasSqrt = 1,
5409  HasRsqrt = 1,
5410  HasErf = EIGEN_FAST_MATH,
5411  HasBessel = 0, // Issues with accuracy.
5412  HasNdtri = 0
5413  };
5414 };
5415 
5416 template <>
5417 struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
5418 template <>
5419 struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
5420  using half = Packet4hf;
5421 };
5422 
5423 template <>
5424 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
5425  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
5426 }
5427 
5428 template <>
5429 EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
5430  return vdupq_n_f16(from.x);
5431 }
5432 
5433 template <>
5434 EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
5435  return vdup_n_f16(from.x);
5436 }
5437 
5438 template <>
5439 EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
5440  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
5441  Packet8hf countdown = vld1q_f16(f);
5442  return vaddq_f16(pset1<Packet8hf>(a), countdown);
5443 }
5444 
5445 template <>
5446 EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
5447  const float16_t f[] = {0, 1, 2, 3};
5448  Packet4hf countdown = vld1_f16(f);
5449  return vadd_f16(pset1<Packet4hf>(a), countdown);
5450 }
5451 
5452 template <>
5453 EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5454  return vaddq_f16(a, b);
5455 }
5456 
5457 template <>
5458 EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5459  return vadd_f16(a, b);
5460 }
5461 
5462 template <>
5463 EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5464  return vsubq_f16(a, b);
5465 }
5466 
5467 template <>
5468 EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5469  return vsub_f16(a, b);
5470 }
5471 
5472 template <>
5473 EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
5474  return vnegq_f16(a);
5475 }
5476 
5477 template <>
5478 EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
5479  return vneg_f16(a);
5480 }
5481 
5482 template <>
5483 EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
5484  return a;
5485 }
5486 
5487 template <>
5488 EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
5489  return a;
5490 }
5491 
5492 template <>
5493 EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5494  return vmulq_f16(a, b);
5495 }
5496 
5497 template <>
5498 EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5499  return vmul_f16(a, b);
5500 }
5501 
5502 template <>
5503 EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5504  return vdivq_f16(a, b);
5505 }
5506 
5507 template <>
5508 EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5509  return vdiv_f16(a, b);
5510 }
5511 
5512 template <>
5513 EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5514  return vfmaq_f16(c, a, b);
5515 }
5516 
5517 template <>
5518 EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5519  return vfma_f16(c, a, b);
5520 }
5521 
5522 template <>
5523 EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5524  return vfmsq_f16(c, a, b);
5525 }
5526 
5527 template <>
5528 EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5529  return vfms_f16(c, a, b);
5530 }
5531 
5532 template <>
5533 EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5534  return pnegate(pnmadd(a, b, c));
5535 }
5536 
5537 template <>
5538 EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5539  return pnegate(pnmadd(a, b, c));
5540 }
5541 
5542 template <>
5543 EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5544  return pnegate(pmadd(a, b, c));
5545 }
5546 
5547 template <>
5548 EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5549  return pnegate(pmadd(a, b, c));
5550 }
5551 
5552 template <>
5553 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5554  return vminq_f16(a, b);
5555 }
5556 
5557 template <>
5558 EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5559  return vmin_f16(a, b);
5560 }
5561 
5562 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
5563 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
5564 // systems).
5565 template <>
5566 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5567  return vminnm_f16(a, b);
5568 }
5569 template <>
5570 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5571  return vminnmq_f16(a, b);
5572 }
5573 #endif
5574 
5575 template <>
5576 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5577  return pmin<Packet4hf>(a, b);
5578 }
5579 
5580 template <>
5581 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5582  return pmin<Packet8hf>(a, b);
5583 }
5584 
5585 template <>
5586 EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5587  return vmaxq_f16(a, b);
5588 }
5589 
5590 template <>
5591 EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5592  return vmax_f16(a, b);
5593 }
5594 
5595 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
5596 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
5597 // systems).
5598 template <>
5599 EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5600  return vmaxnm_f16(a, b);
5601 }
5602 template <>
5603 EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5604  return vmaxnmq_f16(a, b);
5605 }
5606 #endif
5607 
5608 template <>
5609 EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5610  return pmax<Packet4hf>(a, b);
5611 }
5612 
5613 template <>
5614 EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5615  return pmax<Packet8hf>(a, b);
5616 }
5617 
5618 #define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
5619  template <> \
5620  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
5621  return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
5622  }
5623 
5624 #define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
5625  template <> \
5626  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
5627  return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
5628  }
5629 
5630 EIGEN_MAKE_ARM_FP16_CMP_8(eq)
5631 EIGEN_MAKE_ARM_FP16_CMP_8(lt)
5632 EIGEN_MAKE_ARM_FP16_CMP_8(le)
5633 
5634 EIGEN_MAKE_ARM_FP16_CMP_4(eq)
5635 EIGEN_MAKE_ARM_FP16_CMP_4(lt)
5636 EIGEN_MAKE_ARM_FP16_CMP_4(le)
5637 
5638 #undef EIGEN_MAKE_ARM_FP16_CMP_8
5639 #undef EIGEN_MAKE_ARM_FP16_CMP_4
5640 
5641 template <>
5642 EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5643  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
5644 }
5645 
5646 template <>
5647 EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5648  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
5649 }
5650 
5651 template <>
5652 EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
5653  return vrndnq_f16(a);
5654 }
5655 
5656 template <>
5657 EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
5658  return vrndn_f16(a);
5659 }
5660 
5661 template <>
5662 EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
5663  return vrndmq_f16(a);
5664 }
5665 
5666 template <>
5667 EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
5668  return vrndm_f16(a);
5669 }
5670 
5671 template <>
5672 EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
5673  return vrndpq_f16(a);
5674 }
5675 
5676 template <>
5677 EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
5678  return vrndp_f16(a);
5679 }
5680 
5681 template <>
5682 EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
5683  return vrndaq_f16(a);
5684 }
5685 
5686 template <>
5687 EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
5688  return vrnda_f16(a);
5689 }
5690 
5691 template <>
5692 EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
5693  return vrndq_f16(a);
5694 }
5695 
5696 template <>
5697 EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
5698  return vrnd_f16(a);
5699 }
5700 
5701 template <>
5702 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
5703  return vsqrtq_f16(a);
5704 }
5705 
5706 template <>
5707 EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
5708  return vsqrt_f16(a);
5709 }
5710 
5711 template <>
5712 EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5713  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5714 }
5715 
5716 template <>
5717 EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5718  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5719 }
5720 
5721 template <>
5722 EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5723  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5724 }
5725 
5726 template <>
5727 EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5728  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5729 }
5730 
5731 template <>
5732 EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5733  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5734 }
5735 
5736 template <>
5737 EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5738  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5739 }
5740 
5741 template <>
5742 EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5743  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5744 }
5745 
5746 template <>
5747 EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5748  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5749 }
5750 
5751 template <>
5752 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
5753  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(
5754  reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(from)));
5755 }
5756 
5757 template <>
5758 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
5759  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(
5760  reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(from)));
5761 }
5762 
5763 template <>
5764 EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
5765  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5766 }
5767 
5768 template <>
5769 EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
5770  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5771 }
5772 
5773 template <>
5774 EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
5775  Packet8hf packet;
5776  packet[0] = from[0].x;
5777  packet[1] = from[0].x;
5778  packet[2] = from[1].x;
5779  packet[3] = from[1].x;
5780  packet[4] = from[2].x;
5781  packet[5] = from[2].x;
5782  packet[6] = from[3].x;
5783  packet[7] = from[3].x;
5784  return packet;
5785 }
5786 
5787 template <>
5788 EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
5789  float16x4_t packet;
5790  float16_t* tmp;
5791  tmp = (float16_t*)&packet;
5792  tmp[0] = from[0].x;
5793  tmp[1] = from[0].x;
5794  tmp[2] = from[1].x;
5795  tmp[3] = from[1].x;
5796  return packet;
5797 }
5798 
5799 template <>
5800 EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
5801  Packet4hf lo, hi;
5802  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
5803  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
5804  return vcombine_f16(lo, hi);
5805 }
5806 
5807 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
5808  return vsetq_lane_f16(b.x, a, 0);
5809 }
5810 
5811 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
5812  return vset_lane_f16(b.x, a, 0);
5813 }
5814 
5815 template <>
5816 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
5817  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
5818 }
5819 
5820 template <>
5821 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
5822  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
5823 }
5824 
5825 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
5826  return vsetq_lane_f16(b.x, a, 7);
5827 }
5828 
5829 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
5830  return vset_lane_f16(b.x, a, 3);
5831 }
5832 
5833 template <>
5834 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5835  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
5836  reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(to)), from);
5837 }
5838 
5839 template <>
5840 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5841  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
5842  reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(to)), from);
5843 }
5844 
5845 template <>
5846 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5847  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5848 }
5849 
5850 template <>
5851 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5852  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5853 }
5854 
5855 template <>
5856 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
5857  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
5858  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
5859  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
5860  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
5861  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
5862  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
5863  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
5864  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
5865  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
5866  return res;
5867 }
5868 
5869 template <>
5870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
5871  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
5872  res = vset_lane_f16(from[0 * stride].x, res, 0);
5873  res = vset_lane_f16(from[1 * stride].x, res, 1);
5874  res = vset_lane_f16(from[2 * stride].x, res, 2);
5875  res = vset_lane_f16(from[3 * stride].x, res, 3);
5876  return res;
5877 }
5878 
5879 template <>
5880 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
5881  Index stride) {
5882  to[stride * 0].x = vgetq_lane_f16(from, 0);
5883  to[stride * 1].x = vgetq_lane_f16(from, 1);
5884  to[stride * 2].x = vgetq_lane_f16(from, 2);
5885  to[stride * 3].x = vgetq_lane_f16(from, 3);
5886  to[stride * 4].x = vgetq_lane_f16(from, 4);
5887  to[stride * 5].x = vgetq_lane_f16(from, 5);
5888  to[stride * 6].x = vgetq_lane_f16(from, 6);
5889  to[stride * 7].x = vgetq_lane_f16(from, 7);
5890 }
5891 
5892 template <>
5893 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
5894  Index stride) {
5895  to[stride * 0].x = vget_lane_f16(from, 0);
5896  to[stride * 1].x = vget_lane_f16(from, 1);
5897  to[stride * 2].x = vget_lane_f16(from, 2);
5898  to[stride * 3].x = vget_lane_f16(from, 3);
5899 }
5900 
5901 template <>
5902 EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
5903  EIGEN_ARM_PREFETCH(addr);
5904 }
5905 
5906 template <>
5907 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
5908  float16_t x[8];
5909  vst1q_f16(x, a);
5910  Eigen::half h;
5911  h.x = x[0];
5912  return h;
5913 }
5914 
5915 template <>
5916 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
5917  float16_t x[4];
5918  vst1_f16(x, a);
5919  Eigen::half h;
5920  h.x = x[0];
5921  return h;
5922 }
5923 
5924 template <>
5925 EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
5926  float16x4_t a_lo, a_hi;
5927  Packet8hf a_r64;
5928 
5929  a_r64 = vrev64q_f16(a);
5930  a_lo = vget_low_f16(a_r64);
5931  a_hi = vget_high_f16(a_r64);
5932  return vcombine_f16(a_hi, a_lo);
5933 }
5934 
5935 template <>
5936 EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
5937  return vrev64_f16(a);
5938 }
5939 
5940 template <>
5941 EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
5942  return vabsq_f16(a);
5943 }
5944 
5945 template <>
5946 EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
5947  return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
5948 }
5949 
5950 template <>
5951 EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
5952  return vabs_f16(a);
5953 }
5954 
5955 template <>
5956 EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
5957  return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
5958 }
5959 
5960 template <>
5961 EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
5962  float16x4_t a_lo, a_hi, sum;
5963 
5964  a_lo = vget_low_f16(a);
5965  a_hi = vget_high_f16(a);
5966  sum = vpadd_f16(a_lo, a_hi);
5967  sum = vpadd_f16(sum, sum);
5968  sum = vpadd_f16(sum, sum);
5969 
5970  Eigen::half h;
5971  h.x = vget_lane_f16(sum, 0);
5972  return h;
5973 }
5974 
5975 template <>
5976 EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
5977  float16x4_t sum;
5978 
5979  sum = vpadd_f16(a, a);
5980  sum = vpadd_f16(sum, sum);
5981  Eigen::half h;
5982  h.x = vget_lane_f16(sum, 0);
5983  return h;
5984 }
5985 
5986 template <>
5987 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
5988  float16x4_t a_lo, a_hi, prod;
5989 
5990  a_lo = vget_low_f16(a);
5991  a_hi = vget_high_f16(a);
5992  prod = vmul_f16(a_lo, a_hi);
5993  prod = vmul_f16(prod, vrev64_f16(prod));
5994 
5995  Eigen::half h;
5996  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
5997  return h;
5998 }
5999 
6000 template <>
6001 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
6002  float16x4_t prod;
6003  prod = vmul_f16(a, vrev64_f16(a));
6004  Eigen::half h;
6005  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6006  return h;
6007 }
6008 
6009 template <>
6010 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
6011  Eigen::half h;
6012  h.x = vminvq_f16(a);
6013  return h;
6014 }
6015 
6016 template <>
6017 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
6018  Eigen::half h;
6019  h.x = vminv_f16(a);
6020  return h;
6021 }
6022 
6023 template <>
6024 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
6025  Eigen::half h;
6026  h.x = vmaxvq_f16(a);
6027  return h;
6028 }
6029 
6030 template <>
6031 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
6032  Eigen::half h;
6033  h.x = vmaxv_f16(a);
6034  return h;
6035 }
6036 
6037 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
6038  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
6039  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
6040 
6041  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
6042  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
6043 
6044  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
6045  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
6046  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
6047  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
6048 }
6049 
6050 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
6051  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
6052  float16_t* tmp = (float16_t*)&kernel;
6053  tmp_x4 = vld4_f16(tmp);
6054 
6055  kernel.packet[0] = tmp_x4.val[0];
6056  kernel.packet[1] = tmp_x4.val[1];
6057  kernel.packet[2] = tmp_x4.val[2];
6058  kernel.packet[3] = tmp_x4.val[3];
6059 }
6060 
6061 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
6062  float16x8x2_t T_1[4];
6063 
6064  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
6065  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
6066  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
6067  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
6068 
6069  float16x8x2_t T_2[4];
6070  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
6071  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
6072  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
6073  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
6074 
6075  float16x8x2_t T_3[4];
6076  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
6077  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
6078  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
6079  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
6080 
6081  kernel.packet[0] = T_3[0].val[0];
6082  kernel.packet[1] = T_3[2].val[0];
6083  kernel.packet[2] = T_3[1].val[0];
6084  kernel.packet[3] = T_3[3].val[0];
6085  kernel.packet[4] = T_3[0].val[1];
6086  kernel.packet[5] = T_3[2].val[1];
6087  kernel.packet[6] = T_3[1].val[1];
6088  kernel.packet[7] = T_3[3].val[1];
6089 }
6090 #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
6091 
6092 } // end namespace internal
6093 
6094 } // end namespace Eigen
6095 
6096 #endif // EIGEN_PACKET_MATH_NEON_H
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82