12 #ifndef EIGEN_PACKET_MATH_NEON_H 13 #define EIGEN_PACKET_MATH_NEON_H 16 #include "../../InternalHeaderCheck.h" 22 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 23 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 26 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 27 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 30 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 34 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 38 #if EIGEN_COMP_MSVC_STRICT 44 typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
// Packet types used inside the EIGEN_COMP_MSVC_STRICT branch. Each native NEON vector type is
// wrapped in eigen_packet_wrapper with its own integer tag so that every packet is a distinct
// C++ type.
// NOTE(review): presumably required because MSVC maps several NEON vector types onto the same
// underlying type -- confirm.
// Packet4c / Packet4uc keep four 8-bit lanes inside a plain 32-bit integer, not a vector type.
45 typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
46 typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
47 typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
48 typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
49 typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
50 typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
51 typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
52 typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
53 typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
54 typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
55 typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
56 typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
57 typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
58 typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
59 typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
60 typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
61 typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
63 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
64 float from[4] = {a, b, c, d};
65 return vld1q_f32(from);
68 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(
float a,
float b) {
69 float from[2] = {a, b};
70 return vld1_f32(from);
// Packet types mapped directly onto the native NEON vector types.
// NOTE(review): presumably the alternative to the EIGEN_COMP_MSVC_STRICT typedefs; the
// separating preprocessor line is not visible in this excerpt.
// Packet4c / Packet4uc still use eigen_packet_wrapper around a 32-bit integer (tags 2 and 5),
// holding four 8-bit lanes in a scalar.
75 typedef float32x2_t Packet2f;
76 typedef float32x4_t Packet4f;
77 typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
78 typedef int8x8_t Packet8c;
79 typedef int8x16_t Packet16c;
80 typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
81 typedef uint8x8_t Packet8uc;
82 typedef uint8x16_t Packet16uc;
83 typedef int16x4_t Packet4s;
84 typedef int16x8_t Packet8s;
85 typedef uint16x4_t Packet4us;
86 typedef uint16x8_t Packet8us;
87 typedef int32x2_t Packet2i;
88 typedef int32x4_t Packet4i;
89 typedef uint32x2_t Packet2ui;
90 typedef uint32x4_t Packet4ui;
91 typedef int64x2_t Packet2l;
92 typedef uint64x2_t Packet2ul;
// Builds a Packet4f with lanes a, b, c, d (in that order) by brace-initialising the vector type.
94 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
return Packet4f{a, b, c, d}; }
// Builds a Packet2f with lanes a, b (in that order) by brace-initialising the vector type.
95 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(
float a,
float b) {
return Packet2f{a, b}; }
97 #endif // EIGEN_COMP_MSVC_STRICT 99 EIGEN_STRONG_INLINE Packet4f shuffle1(
const Packet4f& m,
int mask) {
100 const float* a =
reinterpret_cast<const float*
>(&m);
102 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
110 template <
bool interleave>
111 EIGEN_STRONG_INLINE Packet4f shuffle2(
const Packet4f& m,
const Packet4f& n,
int mask) {
112 const float* a =
reinterpret_cast<const float*
>(&m);
113 const float* b =
reinterpret_cast<const float*
>(&n);
115 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
120 EIGEN_STRONG_INLINE Packet4f shuffle2<true>(
const Packet4f& m,
const Packet4f& n,
int mask) {
121 const float* a =
reinterpret_cast<const float*
>(&m);
122 const float* b =
reinterpret_cast<const float*
>(&n);
124 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
128 EIGEN_STRONG_INLINE
static int eigen_neon_shuffle_mask(
int p,
int q,
int r,
int s) {
129 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
132 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(
const Packet4f& a,
int p,
int q,
int r,
int s) {
133 return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
135 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(
const Packet4f& a,
const Packet4f& b,
int p,
int q,
int r,
int s) {
136 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
138 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(
const Packet4f& a,
const Packet4f& b) {
139 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
141 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(
const Packet4f& a,
const Packet4f& b) {
142 return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
144 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(
const Packet4f& a,
const Packet4f& b) {
145 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
147 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(
const Packet4f& a,
const Packet4f& b) {
148 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
// Helper macros, followed by the opening of packet_traits<float>.
//  - vec4f_duplane(a, p): broadcasts lane p of the low half of `a` to all four lanes.
//  - EIGEN_DECLARE_CONST_Packet4f / _Packet4i: declare a constant packet named p4f_NAME / p4i_NAME
//    filled with X via pset1.
//  - EIGEN_ARM_PREFETCH(ADDR): prefetch hint; expands to inline asm, __builtin_prefetch, __pld or
//    nothing depending on which preprocessor branch is taken.
// NOTE(review): EIGEN_DECLARE_CONST_Packet4f_FROM_INT passes pset1<int32_t>(X) to
// vreinterpretq_f32_u32, which takes a uint32x4_t; this looks like it would not compile if the
// macro were expanded -- confirm, and consider pset1<Packet4ui>(X).
// packet_traits<float>: the full packet is Packet4f and the half packet is Packet2f. The
// math-function flags visible in this excerpt are all tied to EIGEN_FAST_MATH.
150 #define vec4f_duplane(a, p) Packet4f(vdupq_lane_f32(vget_low_f32(a), p)) 152 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X) 154 #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \ 155 const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X)) 157 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X) 159 #if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC 163 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :); 164 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 165 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); 167 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) 169 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :); 172 #define EIGEN_ARM_PREFETCH(ADDR) 176 struct packet_traits<float> : default_packet_traits {
177 typedef Packet4f type;
178 typedef Packet2f half;
200 HasSin = EIGEN_FAST_MATH,
201 HasCos = EIGEN_FAST_MATH,
212 HasTanh = EIGEN_FAST_MATH,
213 HasErf = EIGEN_FAST_MATH,
214 HasErfc = EIGEN_FAST_MATH,
// packet_traits for the integer scalar types: `type` is the full-width packet used for the
// scalar and `half` the packet used when only half as many lanes are wanted.
// NOTE(review): the feature-flag members of these structs are not visible in this excerpt.
// int8_t: 16 lanes full, 8 lanes half.
221 struct packet_traits<int8_t> : default_packet_traits {
222 typedef Packet16c type;
223 typedef Packet8c half;
// uint8_t: 16 lanes full, 8 lanes half.
248 struct packet_traits<uint8_t> : default_packet_traits {
249 typedef Packet16uc type;
250 typedef Packet8uc half;
// int16_t: 8 lanes full, 4 lanes half.
277 struct packet_traits<int16_t> : default_packet_traits {
278 typedef Packet8s type;
279 typedef Packet4s half;
// uint16_t: 8 lanes full, 4 lanes half.
304 struct packet_traits<uint16_t> : default_packet_traits {
305 typedef Packet8us type;
306 typedef Packet4us half;
// int32_t: 4 lanes full, 2 lanes half.
332 struct packet_traits<int32_t> : default_packet_traits {
333 typedef Packet4i type;
334 typedef Packet2i half;
// uint32_t: 4 lanes full, 2 lanes half.
359 struct packet_traits<uint32_t> : default_packet_traits {
360 typedef Packet4ui type;
361 typedef Packet2ui half;
// int64_t: `half` is the same 2-lane packet as `type`.
388 struct packet_traits<int64_t> : default_packet_traits {
389 typedef Packet2l type;
390 typedef Packet2l half;
// uint64_t: `half` is the same 2-lane packet as `type`.
415 struct packet_traits<uint64_t> : default_packet_traits {
416 typedef Packet2ul type;
417 typedef Packet2ul half;
// Common base for the unpacket_traits specializations of the NEON packet types.
// Derives the lane count and alignment from the sizes of Packet and Scalar.
441 template <
typename Packet,
typename Scalar>
442 struct neon_unpacket_default {
// Number of Scalar lanes in one Packet.
445 static constexpr
int size =
sizeof(Packet) /
sizeof(Scalar);
// Alignment is set to the byte size of the packet.
446 static constexpr
int alignment =
sizeof(Packet);
447 static constexpr
bool vectorizable =
true;
// Masked loads and stores are reported as unavailable.
448 static constexpr
bool masked_load_available =
false;
449 static constexpr
bool masked_store_available =
false;
// unpacket_traits for every NEON packet type. Each specialization inherits size/alignment from
// neon_unpacket_default<Packet, Scalar>. Where present, `half` names the packet with half as
// many lanes and `integer_packet` names the same-shape integer packet.
// NOTE(review): specializations that declare no `half` presumably rely on a default from the
// base; that member is not visible in this excerpt.
453 struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
454 using integer_packet = Packet2i;
457 struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
458 using half = Packet2f;
459 using integer_packet = Packet4i;
462 struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
464 struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
465 using half = Packet4c;
468 struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
469 using half = Packet8c;
472 struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
474 struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
475 using half = Packet4uc;
478 struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
479 using half = Packet8uc;
482 struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
484 struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
485 using half = Packet4s;
488 struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
490 struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
491 using half = Packet4us;
494 struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
496 struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
497 using half = Packet2i;
500 struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
502 struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
503 using half = Packet2ui;
506 struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
508 struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
// pzero: returns a float packet with every lane set to 0.0f. The argument is unused and only
// selects the overload.
511 EIGEN_STRONG_INLINE Packet2f pzero(
const Packet2f& ) {
512 return vdup_n_f32(0.0f);
516 EIGEN_STRONG_INLINE Packet4f pzero(
const Packet4f& ) {
517 return vdupq_n_f32(0.0f);
// pset1<Packet>(from): returns a packet with every lane set to `from`, using the vdup_n_* /
// vdupq_n_* intrinsic that matches the packet type.
521 EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(
const float& from) {
522 return vdup_n_f32(from);
525 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
526 return vdupq_n_f32(from);
// Packet4c is a 32-bit scalar: duplicate into an 8-lane vector, then extract the first 32 bits.
529 EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(
const int8_t& from) {
530 return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
533 EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(
const int8_t& from) {
534 return vdup_n_s8(from);
537 EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const int8_t& from) {
538 return vdupq_n_s8(from);
// Packet4uc is a 32-bit scalar: same approach as Packet4c, with unsigned intrinsics.
541 EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(
const uint8_t& from) {
542 return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
545 EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(
const uint8_t& from) {
546 return vdup_n_u8(from);
549 EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const uint8_t& from) {
550 return vdupq_n_u8(from);
553 EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(
const int16_t& from) {
554 return vdup_n_s16(from);
557 EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const int16_t& from) {
558 return vdupq_n_s16(from);
561 EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(
const uint16_t& from) {
562 return vdup_n_u16(from);
565 EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const uint16_t& from) {
566 return vdupq_n_u16(from);
569 EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(
const int32_t& from) {
570 return vdup_n_s32(from);
573 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
574 return vdupq_n_s32(from);
577 EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(
const uint32_t& from) {
578 return vdup_n_u32(from);
581 EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(
const uint32_t& from) {
582 return vdupq_n_u32(from);
585 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
586 return vdupq_n_s64(from);
589 EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(
const uint64_t& from) {
590 return vdupq_n_u64(from);
// pset1frombits<Packet>(from): fills every lane with the 32-bit pattern `from` and reinterprets
// the lanes as float (no numeric conversion).
594 EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
595 return vreinterpret_f32_u32(vdup_n_u32(from));
598 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
599 return vreinterpretq_f32_u32(vdupq_n_u32(from));
// plset<Packet>(a): returns { a, a+1, a+2, ... }. Each specialization broadcasts `a` and adds a
// constant ramp 0, 1, 2, ... loaded from a local array.
603 EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(
const float& a) {
604 const float c[] = {0.0f, 1.0f};
605 return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
608 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
609 const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
610 return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
// For the 32-bit-scalar packet the ramp is the constant 0x03020100 (bytes 0,1,2,3 from the
// least-significant byte up). NOTE(review): lane order relies on little-endian layout -- confirm.
613 EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(
const int8_t& a) {
614 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
617 EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(
const int8_t& a) {
618 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
619 return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
622 EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const int8_t& a) {
623 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
624 return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
// Unsigned counterpart of plset<Packet4c>; same 0x03020100 ramp constant.
627 EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(
const uint8_t& a) {
628 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
631 EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(
const uint8_t& a) {
632 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
633 return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
636 EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const uint8_t& a) {
637 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
638 return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
641 EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(
const int16_t& a) {
642 const int16_t c[] = {0, 1, 2, 3};
643 return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
646 EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(
const uint16_t& a) {
647 const uint16_t c[] = {0, 1, 2, 3};
648 return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
651 EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const int16_t& a) {
652 const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
653 return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
656 EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const uint16_t& a) {
657 const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
658 return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
661 EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(
const int32_t& a) {
662 const int32_t c[] = {0, 1};
663 return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
666 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
667 const int32_t c[] = {0, 1, 2, 3};
668 return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
671 EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(
const uint32_t& a) {
672 const uint32_t c[] = {0, 1};
673 return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
676 EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(
const uint32_t& a) {
677 const uint32_t c[] = {0, 1, 2, 3};
678 return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
681 EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(
const int64_t& a) {
682 const int64_t c[] = {0, 1};
683 return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
686 EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(
const uint64_t& a) {
687 const uint64_t c[] = {0, 1};
688 return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
// padd<Packet>(a, b): lane-wise a + b using the vadd_* / vaddq_* intrinsic for the packet type.
692 EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
693 return vadd_f32(a, b);
696 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
697 return vaddq_f32(a, b);
// Packet4c is a 32-bit scalar: both operands are duplicated into a 64-bit vector, added as
// eight int8 lanes, and the first 32 bits of the result are extracted.
700 EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
701 return vget_lane_s32(
702 vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
705 EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
706 return vadd_s8(a, b);
709 EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
710 return vaddq_s8(a, b);
// Unsigned counterpart of padd<Packet4c>.
713 EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
714 return vget_lane_u32(
715 vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
718 EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
719 return vadd_u8(a, b);
722 EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
723 return vaddq_u8(a, b);
726 EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
727 return vadd_s16(a, b);
730 EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
731 return vaddq_s16(a, b);
734 EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
735 return vadd_u16(a, b);
738 EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
739 return vaddq_u16(a, b);
742 EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
743 return vadd_s32(a, b);
746 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
747 return vaddq_s32(a, b);
750 EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
751 return vadd_u32(a, b);
754 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
755 return vaddq_u32(a, b);
758 EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
759 return vaddq_s64(a, b);
762 EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
763 return vaddq_u64(a, b);
// psub<Packet>(a, b): lane-wise a - b using the vsub_* / vsubq_* intrinsic for the packet type.
767 EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
768 return vsub_f32(a, b);
771 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
772 return vsubq_f32(a, b);
// Packet4c is a 32-bit scalar: operands are duplicated into a 64-bit vector, subtracted as
// eight int8 lanes, and the first 32 bits of the result are extracted.
775 EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
776 return vget_lane_s32(
777 vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
780 EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
781 return vsub_s8(a, b);
784 EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
785 return vsubq_s8(a, b);
// Unsigned counterpart of psub<Packet4c>.
788 EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
789 return vget_lane_u32(
790 vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
793 EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
794 return vsub_u8(a, b);
797 EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
798 return vsubq_u8(a, b);
801 EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
802 return vsub_s16(a, b);
805 EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
806 return vsubq_s16(a, b);
809 EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
810 return vsub_u16(a, b);
813 EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
814 return vsubq_u16(a, b);
817 EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
818 return vsub_s32(a, b);
821 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
822 return vsubq_s32(a, b);
825 EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
826 return vsub_u32(a, b);
829 EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
830 return vsubq_u32(a, b);
833 EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
834 return vsubq_s64(a, b);
837 EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
838 return vsubq_u64(a, b);
// Forward declaration: pxor<Packet2f> is used by paddsub below before its definition.
842 EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(
const Packet2f& a,
const Packet2f& b);
// paddsub(a, b): the mask holds the float sign bit (0x80000000) in lane 0 and zero in lane 1.
// Assuming pxor is a bitwise XOR, pxor(mask, b) negates b in lane 0 only, so the result is
// { a[0] - b[0], a[1] + b[1] }.
844 EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
845 Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
846 return padd(a, pxor(mask, b));
// Forward declaration: pxor<Packet4f> is used by paddsub below before its definition.
849 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b);
// Four-lane version: sign bit set in lanes 0 and 2, so (under the same assumption) the result
// is { a[0] - b[0], a[1] + b[1], a[2] - b[2], a[3] + b[3] }.
851 EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
852 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
853 return padd(a, pxor(mask, b));
// pnegate(a) overloads for the float and signed-integer packets.
// NOTE(review): most bodies are not visible in this excerpt; only the Packet4c and Packet2l
// statements shown below can be described.
857 EIGEN_STRONG_INLINE Packet2f pnegate(
const Packet2f& a) {
861 EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
// Packet4c is a 32-bit scalar: duplicate into a 64-bit vector, negate as int8 lanes (vneg_s8),
// extract the first 32 bits.
865 EIGEN_STRONG_INLINE Packet4c pnegate(
const Packet4c& a) {
866 return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
869 EIGEN_STRONG_INLINE Packet8c pnegate(
const Packet8c& a) {
873 EIGEN_STRONG_INLINE Packet16c pnegate(
const Packet16c& a) {
877 EIGEN_STRONG_INLINE Packet4s pnegate(
const Packet4s& a) {
881 EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
885 EIGEN_STRONG_INLINE Packet2i pnegate(
const Packet2i& a) {
889 EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
// The visible statement negates each 64-bit lane as a scalar and recombines the two lanes.
// NOTE(review): preceding lines of this body are not visible; this may be one branch of a
// conditional -- confirm.
893 EIGEN_STRONG_INLINE Packet2l pnegate(
const Packet2l& a) {
897 return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
// pconj(a) overloads, one per real-valued packet type.
// NOTE(review): the bodies are not visible in this excerpt; for real-valued lanes conjugation
// is presumably the identity -- confirm against the full file.
902 EIGEN_STRONG_INLINE Packet2f pconj(
const Packet2f& a) {
906 EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
910 EIGEN_STRONG_INLINE Packet4c pconj(
const Packet4c& a) {
914 EIGEN_STRONG_INLINE Packet8c pconj(
const Packet8c& a) {
918 EIGEN_STRONG_INLINE Packet16c pconj(
const Packet16c& a) {
922 EIGEN_STRONG_INLINE Packet4uc pconj(
const Packet4uc& a) {
926 EIGEN_STRONG_INLINE Packet8uc pconj(
const Packet8uc& a) {
930 EIGEN_STRONG_INLINE Packet16uc pconj(
const Packet16uc& a) {
934 EIGEN_STRONG_INLINE Packet4s pconj(
const Packet4s& a) {
938 EIGEN_STRONG_INLINE Packet8s pconj(
const Packet8s& a) {
942 EIGEN_STRONG_INLINE Packet4us pconj(
const Packet4us& a) {
946 EIGEN_STRONG_INLINE Packet8us pconj(
const Packet8us& a) {
950 EIGEN_STRONG_INLINE Packet2i pconj(
const Packet2i& a) {
954 EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
958 EIGEN_STRONG_INLINE Packet2ui pconj(
const Packet2ui& a) {
962 EIGEN_STRONG_INLINE Packet4ui pconj(
const Packet4ui& a) {
966 EIGEN_STRONG_INLINE Packet2l pconj(
const Packet2l& a) {
970 EIGEN_STRONG_INLINE Packet2ul pconj(
const Packet2ul& a) {
// pmul<Packet>(a, b): lane-wise a * b using the vmul_* / vmulq_* intrinsic for the packet type.
975 EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
976 return vmul_f32(a, b);
979 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
980 return vmulq_f32(a, b);
// Packet4c is a 32-bit scalar: operands are duplicated into a 64-bit vector, multiplied as
// eight int8 lanes, and the first 32 bits of the result are extracted.
983 EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
984 return vget_lane_s32(
985 vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
988 EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
989 return vmul_s8(a, b);
992 EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
993 return vmulq_s8(a, b);
// Unsigned counterpart of pmul<Packet4c>.
996 EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
997 return vget_lane_u32(
998 vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1001 EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1002 return vmul_u8(a, b);
1005 EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1006 return vmulq_u8(a, b);
1009 EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1010 return vmul_s16(a, b);
1013 EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1014 return vmulq_s16(a, b);
1017 EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1018 return vmul_u16(a, b);
1021 EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1022 return vmulq_u16(a, b);
1025 EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1026 return vmul_s32(a, b);
1029 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1030 return vmulq_s32(a, b);
1033 EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1034 return vmul_u32(a, b);
1037 EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1038 return vmulq_u32(a, b);
// 64-bit lanes: no vector multiply intrinsic is used here. Each lane is extracted, multiplied
// as a scalar, and the two products are recombined into a packet.
1041 EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1042 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
1043 vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
// Unsigned counterpart of pmul<Packet2l>.
1046 EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1047 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
1048 vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
// pdiv for integer packets is a stub: every specialization ignores its operands, trips
// eigen_assert(false && "...") and, if execution continues past the assertion, returns a
// packet with every lane set to zero.
1052 EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(
const Packet4c& ,
const Packet4c& ) {
1053 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1054 return pset1<Packet4c>(0);
1057 EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(
const Packet8c& ,
const Packet8c& ) {
1058 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1059 return pset1<Packet8c>(0);
1062 EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(
const Packet16c& ,
const Packet16c& ) {
1063 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1064 return pset1<Packet16c>(0);
1067 EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(
const Packet4uc& ,
const Packet4uc& ) {
1068 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1069 return pset1<Packet4uc>(0);
1072 EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(
const Packet8uc& ,
const Packet8uc& ) {
1073 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1074 return pset1<Packet8uc>(0);
1077 EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(
const Packet16uc& ,
const Packet16uc& ) {
1078 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1079 return pset1<Packet16uc>(0);
1082 EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(
const Packet4s& ,
const Packet4s& ) {
1083 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1084 return pset1<Packet4s>(0);
1087 EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(
const Packet8s& ,
const Packet8s& ) {
1088 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1089 return pset1<Packet8s>(0);
1092 EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(
const Packet4us& ,
const Packet4us& ) {
1093 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1094 return pset1<Packet4us>(0);
1097 EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(
const Packet8us& ,
const Packet8us& ) {
1098 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1099 return pset1<Packet8us>(0);
1102 EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(
const Packet2i& ,
const Packet2i& ) {
1103 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1104 return pset1<Packet2i>(0);
1107 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& ,
const Packet4i& ) {
1108 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1109 return pset1<Packet4i>(0);
1112 EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(
const Packet2ui& ,
const Packet2ui& ) {
1113 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1114 return pset1<Packet2ui>(0);
1117 EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(
const Packet4ui& ,
const Packet4ui& ) {
1118 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1119 return pset1<Packet4ui>(0);
1122 EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(
const Packet2l& ,
const Packet2l& ) {
1123 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1124 return pset1<Packet2l>(0LL);
1127 EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(
const Packet2ul& ,
const Packet2ul& ) {
1128 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1129 return pset1<Packet2ul>(0ULL);
// Float multiply-accumulate family. The accumulator `c` is passed first to the intrinsics.
//   pmadd(a, b, c)  = c + a * b      (vfma* or vmla*)
//   pnmadd(a, b, c) = c - a * b      (vfms* or vmls*)
//   pmsub(a, b, c)  = a * b - c      (negation of pnmadd)
//   pnmsub(a, b, c) = -(a * b) - c   (negation of pmadd)
// Two sets of pmadd/pnmadd definitions appear: the first, under EIGEN_VECTORIZE_FMA, uses the
// vfma/vfms intrinsics; the second uses vmla/vmls.
// NOTE(review): the preprocessor lines separating the two sets are not visible in this excerpt.
1132 #ifdef EIGEN_VECTORIZE_FMA 1134 EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1135 return vfmaq_f32(c, a, b);
1138 EIGEN_STRONG_INLINE Packet2f pmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1139 return vfma_f32(c, a, b);
1142 EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1143 return vfmsq_f32(c, a, b);
1146 EIGEN_STRONG_INLINE Packet2f pnmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1147 return vfms_f32(c, a, b);
1151 EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1152 return vmlaq_f32(c, a, b);
1155 EIGEN_STRONG_INLINE Packet2f pmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1156 return vmla_f32(c, a, b);
1159 EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1160 return vmlsq_f32(c, a, b);
1163 EIGEN_STRONG_INLINE Packet2f pnmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1164 return vmls_f32(c, a, b);
1168 EIGEN_STRONG_INLINE Packet4f pmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1169 return pnegate(pnmadd(a, b, c));
1172 EIGEN_STRONG_INLINE Packet2f pmsub(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1173 return pnegate(pnmadd(a, b, c));
1176 EIGEN_STRONG_INLINE Packet4f pnmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1177 return pnegate(pmadd(a, b, c));
1180 EIGEN_STRONG_INLINE Packet2f pnmsub(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1181 return pnegate(pmadd(a, b, c));
// Integer pmadd(a, b, c) = c + a * b, lane-wise, via the vmla_* / vmlaq_* intrinsics
// (accumulator `c` passed first).
// Packet4c / Packet4uc are 32-bit scalars: all three operands are duplicated into a 64-bit
// vector before the 8-bit multiply-accumulate.
// NOTE(review): the closing lane-index argument of the Packet4c / Packet4uc expressions is not
// visible in this excerpt.
1186 EIGEN_STRONG_INLINE Packet4c pmadd(
const Packet4c& a,
const Packet4c& b,
const Packet4c& c) {
1187 return vget_lane_s32(
1188 vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
1189 vreinterpret_s8_s32(vdup_n_s32(b)))),
1193 EIGEN_STRONG_INLINE Packet8c pmadd(
const Packet8c& a,
const Packet8c& b,
const Packet8c& c) {
1194 return vmla_s8(c, a, b);
1197 EIGEN_STRONG_INLINE Packet16c pmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
1198 return vmlaq_s8(c, a, b);
1201 EIGEN_STRONG_INLINE Packet4uc pmadd(
const Packet4uc& a,
const Packet4uc& b,
const Packet4uc& c) {
1202 return vget_lane_u32(
1203 vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
1204 vreinterpret_u8_u32(vdup_n_u32(b)))),
1208 EIGEN_STRONG_INLINE Packet8uc pmadd(
const Packet8uc& a,
const Packet8uc& b,
const Packet8uc& c) {
1209 return vmla_u8(c, a, b);
1212 EIGEN_STRONG_INLINE Packet16uc pmadd(
const Packet16uc& a,
const Packet16uc& b,
const Packet16uc& c) {
1213 return vmlaq_u8(c, a, b);
1216 EIGEN_STRONG_INLINE Packet4s pmadd(
const Packet4s& a,
const Packet4s& b,
const Packet4s& c) {
1217 return vmla_s16(c, a, b);
1220 EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
1221 return vmlaq_s16(c, a, b);
1224 EIGEN_STRONG_INLINE Packet4us pmadd(
const Packet4us& a,
const Packet4us& b,
const Packet4us& c) {
1225 return vmla_u16(c, a, b);
1228 EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c) {
1229 return vmlaq_u16(c, a, b);
1232 EIGEN_STRONG_INLINE Packet2i pmadd(
const Packet2i& a,
const Packet2i& b,
const Packet2i& c) {
1233 return vmla_s32(c, a, b);
1236 EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
1237 return vmlaq_s32(c, a, b);
1240 EIGEN_STRONG_INLINE Packet2ui pmadd(
const Packet2ui& a,
const Packet2ui& b,
const Packet2ui& c) {
1241 return vmla_u32(c, a, b);
1244 EIGEN_STRONG_INLINE Packet4ui pmadd(
const Packet4ui& a,
const Packet4ui& b,
const Packet4ui& c) {
1245 return vmlaq_u32(c, a, b);
// pabsdiff<Packet>(a, b): lane-wise absolute difference |a - b| via the vabd_* / vabdq_*
// intrinsic for the packet type. No 64-bit-lane specializations appear in this run.
1249 EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1250 return vabd_f32(a, b);
1253 EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1254 return vabdq_f32(a, b);
// Packet4c is a 32-bit scalar: operands are duplicated into a 64-bit vector, processed as
// eight int8 lanes, and the first 32 bits of the result are extracted.
1257 EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1258 return vget_lane_s32(
1259 vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1262 EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1263 return vabd_s8(a, b);
1266 EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1267 return vabdq_s8(a, b);
// Unsigned counterpart of pabsdiff<Packet4c>.
1270 EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1271 return vget_lane_u32(
1272 vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1275 EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1276 return vabd_u8(a, b);
1279 EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1280 return vabdq_u8(a, b);
1283 EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1284 return vabd_s16(a, b);
1287 EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1288 return vabdq_s16(a, b);
1291 EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1292 return vabd_u16(a, b);
1295 EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1296 return vabdq_u16(a, b);
1299 EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1300 return vabd_s32(a, b);
1303 EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1304 return vabdq_s32(a, b);
1307 EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1308 return vabd_u32(a, b);
1311 EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1312 return vabdq_u32(a, b);
// pmin specializations: lane-wise minimum of `a` and `b` via vmin_* / vminq_*.
1316 EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1317 return vmin_f32(a, b);
1320 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1321 return vminq_f32(a, b);
// PropagateNumbers variants, guarded by __ARM_FEATURE_NUMERIC_MAXMIN: they use
// the vminnm* intrinsics (IEEE minNum: a single NaN operand is ignored in
// favour of the numeric one).
// NOTE(review): the end of this #ifdef region is not visible in this excerpt.
1324 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN 1328 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1329 return vminnmq_f32(a, b);
1332 EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1333 return vminnm_f32(a, b);
// PropagateNaN variants simply forward to the default float specializations
// above (vmin_f32 / vminq_f32).
1338 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1339 return pmin<Packet4f>(a, b);
1343 EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1344 return pmin<Packet2f>(a, b);
// Packet4c / Packet4uc are four bytes stored in a 32-bit scalar: broadcast into
// a 64-bit register, reinterpret as bytes, take the minimum, read lane 0 back.
1348 EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1349 return vget_lane_s32(
1350 vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1353 EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1354 return vmin_s8(a, b);
1357 EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1358 return vminq_s8(a, b);
1361 EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1362 return vget_lane_u32(
1363 vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1366 EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1367 return vmin_u8(a, b);
1370 EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1371 return vminq_u8(a, b);
1374 EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1375 return vmin_s16(a, b);
1378 EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1379 return vminq_s16(a, b);
1382 EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1383 return vmin_u16(a, b);
1386 EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1387 return vminq_u16(a, b);
1390 EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1391 return vmin_s32(a, b);
1394 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1395 return vminq_s32(a, b);
1398 EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1399 return vmin_u32(a, b);
1402 EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1403 return vminq_u32(a, b);
// 64-bit lanes: no vector min intrinsic is used here. Each lane is extracted,
// reduced with scalar std::min, and the two results are recombined.
1406 EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1407 return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1408 vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1411 EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1412 return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1413 vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
// pmax specializations: lane-wise maximum of `a` and `b` via vmax_* / vmaxq_*.
// The structure mirrors the pmin family.
1417 EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1418 return vmax_f32(a, b);
1421 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1422 return vmaxq_f32(a, b);
// PropagateNumbers variants, guarded by __ARM_FEATURE_NUMERIC_MAXMIN: they use
// the vmaxnm* intrinsics (IEEE maxNum: a single NaN operand is ignored in
// favour of the numeric one).
// NOTE(review): the end of this #ifdef region is not visible in this excerpt.
1425 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN 1429 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1430 return vmaxnmq_f32(a, b);
1433 EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1434 return vmaxnm_f32(a, b);
// PropagateNaN variants simply forward to the default float specializations
// above (vmax_f32 / vmaxq_f32).
1439 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1440 return pmax<Packet4f>(a, b);
1444 EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1445 return pmax<Packet2f>(a, b);
// Packet4c / Packet4uc are four bytes stored in a 32-bit scalar: broadcast into
// a 64-bit register, reinterpret as bytes, take the maximum, read lane 0 back.
1449 EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1450 return vget_lane_s32(
1451 vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1454 EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1455 return vmax_s8(a, b);
1458 EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1459 return vmaxq_s8(a, b);
1462 EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1463 return vget_lane_u32(
1464 vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1467 EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1468 return vmax_u8(a, b);
1471 EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1472 return vmaxq_u8(a, b);
1475 EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1476 return vmax_s16(a, b);
1479 EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1480 return vmaxq_s16(a, b);
1483 EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1484 return vmax_u16(a, b);
1487 EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1488 return vmaxq_u16(a, b);
1491 EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1492 return vmax_s32(a, b);
1495 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1496 return vmaxq_s32(a, b);
1499 EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1500 return vmax_u32(a, b);
1503 EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1504 return vmaxq_u32(a, b);
// 64-bit lanes: no vector max intrinsic is used here. Each lane is extracted,
// reduced with scalar std::max, and the two results are recombined.
1507 EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1508 return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1509 vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1512 EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1513 return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1514 vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
// pcmp_le specializations: lane-wise a <= b, returned as a mask packet of the
// same type as the inputs. The NEON vcle_* / vcleq_* intrinsics produce an
// unsigned mask (all bits set where the comparison holds, zero otherwise), so
// float and signed-integer packets reinterpret that mask back to their own
// element type; unsigned packets return it directly.
1518 EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1519 return vreinterpret_f32_u32(vcle_f32(a, b));
1522 EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1523 return vreinterpretq_f32_u32(vcleq_f32(a, b));
// Packet4c / Packet4uc: the 32-bit scalar is broadcast into a 64-bit register,
// compared byte-wise, and lane 0 of the resulting mask is extracted.
1526 EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1527 return vget_lane_s32(
1528 vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1531 EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1532 return vreinterpret_s8_u8(vcle_s8(a, b));
1535 EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1536 return vreinterpretq_s8_u8(vcleq_s8(a, b));
1539 EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1540 return vget_lane_u32(
1541 vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1544 EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1545 return vcle_u8(a, b);
1548 EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1549 return vcleq_u8(a, b);
1552 EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1553 return vreinterpret_s16_u16(vcle_s16(a, b));
1556 EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1557 return vreinterpretq_s16_u16(vcleq_s16(a, b));
1560 EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1561 return vcle_u16(a, b);
1564 EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1565 return vcleq_u16(a, b);
1568 EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1569 return vreinterpret_s32_u32(vcle_s32(a, b));
1572 EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1573 return vreinterpretq_s32_u32(vcleq_s32(a, b));
1576 EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1577 return vcle_u32(a, b);
1580 EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1581 return vcleq_u32(a, b);
// 64-bit lanes: when EIGEN_ARCH_ARM64 is set the 64-bit vector compare is used.
// The second return statement builds the mask lane by lane with scalar
// comparisons (-1 = all bits set, 0 otherwise).
// NOTE(review): presumably that is the non-ARM64 branch; the #else/#endif
// lines are not visible in this excerpt.
1584 EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1585 #if EIGEN_ARCH_ARM64 1586 return vreinterpretq_s64_u64(vcleq_s64(a, b));
1588 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1589 vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1593 EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1594 #if EIGEN_ARCH_ARM64 1595 return vcleq_u64(a, b);
1597 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1598 vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
// pcmp_lt specializations: lane-wise a < b, returned as a mask packet of the
// same type as the inputs. The NEON vclt_* / vcltq_* intrinsics produce an
// unsigned mask (all bits set where the comparison holds, zero otherwise), so
// float and signed-integer packets reinterpret that mask back to their own
// element type; unsigned packets return it directly.
1603 EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1604 return vreinterpret_f32_u32(vclt_f32(a, b));
1607 EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1608 return vreinterpretq_f32_u32(vcltq_f32(a, b));
// Packet4c / Packet4uc: the 32-bit scalar is broadcast into a 64-bit register,
// compared byte-wise, and lane 0 of the resulting mask is extracted.
1611 EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1612 return vget_lane_s32(
1613 vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1616 EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1617 return vreinterpret_s8_u8(vclt_s8(a, b));
1620 EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1621 return vreinterpretq_s8_u8(vcltq_s8(a, b));
1624 EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1625 return vget_lane_u32(
1626 vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1629 EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1630 return vclt_u8(a, b);
1633 EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1634 return vcltq_u8(a, b);
1637 EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1638 return vreinterpret_s16_u16(vclt_s16(a, b));
1641 EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1642 return vreinterpretq_s16_u16(vcltq_s16(a, b));
1645 EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1646 return vclt_u16(a, b);
1649 EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1650 return vcltq_u16(a, b);
1653 EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1654 return vreinterpret_s32_u32(vclt_s32(a, b));
1657 EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1658 return vreinterpretq_s32_u32(vcltq_s32(a, b));
1661 EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1662 return vclt_u32(a, b);
1665 EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1666 return vcltq_u32(a, b);
// 64-bit lanes: when EIGEN_ARCH_ARM64 is set the 64-bit vector compare is used.
// The second return statement builds the mask lane by lane with scalar
// comparisons (-1 = all bits set, 0 otherwise).
// NOTE(review): presumably that is the non-ARM64 branch; the #else/#endif
// lines are not visible in this excerpt.
1669 EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1670 #if EIGEN_ARCH_ARM64 1671 return vreinterpretq_s64_u64(vcltq_s64(a, b));
1673 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1674 vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1678 EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1679 #if EIGEN_ARCH_ARM64 1680 return vcltq_u64(a, b);
1682 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1683 vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
// pcmp_eq specializations: lane-wise a == b, returned as a mask packet of the
// same type as the inputs. The NEON vceq_* / vceqq_* intrinsics produce an
// unsigned mask (all bits set where the lanes are equal, zero otherwise), so
// float and signed-integer packets reinterpret that mask back to their own
// element type; unsigned packets return it directly.
1688 EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1689 return vreinterpret_f32_u32(vceq_f32(a, b));
1692 EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1693 return vreinterpretq_f32_u32(vceqq_f32(a, b));
// Packet4c / Packet4uc: the 32-bit scalar is broadcast into a 64-bit register,
// compared byte-wise, and lane 0 of the resulting mask is extracted.
1696 EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1697 return vget_lane_s32(
1698 vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1701 EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1702 return vreinterpret_s8_u8(vceq_s8(a, b));
1705 EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1706 return vreinterpretq_s8_u8(vceqq_s8(a, b));
1709 EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1710 return vget_lane_u32(
1711 vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1714 EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1715 return vceq_u8(a, b);
1718 EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1719 return vceqq_u8(a, b);
1722 EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1723 return vreinterpret_s16_u16(vceq_s16(a, b));
1726 EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1727 return vreinterpretq_s16_u16(vceqq_s16(a, b));
1730 EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1731 return vceq_u16(a, b);
1734 EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1735 return vceqq_u16(a, b);
1738 EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1739 return vreinterpret_s32_u32(vceq_s32(a, b));
1742 EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1743 return vreinterpretq_s32_u32(vceqq_s32(a, b));
1746 EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1747 return vceq_u32(a, b);
1750 EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1751 return vceqq_u32(a, b);
// 64-bit lanes: when EIGEN_ARCH_ARM64 is set the 64-bit vector compare is used.
// The second return statement builds the mask lane by lane with scalar
// comparisons (-1 = all bits set, 0 otherwise).
// NOTE(review): presumably that is the non-ARM64 branch; the #else/#endif
// lines are not visible in this excerpt.
1754 EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1755 #if EIGEN_ARCH_ARM64 1756 return vreinterpretq_s64_u64(vceqq_s64(a, b));
1758 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1759 vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1763 EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1764 #if EIGEN_ARCH_ARM64 1765 return vceqq_u64(a, b);
1767 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1768 vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
// pcmp_lt_or_nan for float packets: computed as the bitwise NOT of the
// (a >= b) mask. Since a >= b is false whenever either operand is NaN, the
// inverted mask is set both where a < b and where the comparison is unordered.
// The unsigned mask is reinterpreted back to the float packet type.
1773 EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1774 return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
1777 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1778 return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
// pand specializations: bitwise AND of `a` and `b` via vand_* / vandq_*.
// Float packets have no bitwise intrinsics of their own, so they are
// reinterpreted as 32-bit unsigned lanes, combined, and reinterpreted back.
1783 EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1784 return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1787 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1788 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
// NOTE(review): the body of the Packet4c specialization is not visible in this
// excerpt.
1791 EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1795 EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1796 return vand_s8(a, b);
1799 EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1800 return vandq_s8(a, b);
// NOTE(review): the body of the Packet4uc specialization is not visible in
// this excerpt.
1803 EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1807 EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1808 return vand_u8(a, b);
1811 EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1812 return vandq_u8(a, b);
1815 EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1816 return vand_s16(a, b);
1819 EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1820 return vandq_s16(a, b);
1823 EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1824 return vand_u16(a, b);
1827 EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1828 return vandq_u16(a, b);
1831 EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1832 return vand_s32(a, b);
1835 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1836 return vandq_s32(a, b);
1839 EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1840 return vand_u32(a, b);
1843 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1844 return vandq_u32(a, b);
1847 EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1848 return vandq_s64(a, b);
1851 EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1852 return vandq_u64(a, b);
// por specializations: bitwise OR of `a` and `b` via vorr_* / vorrq_*.
// Float packets are reinterpreted as 32-bit unsigned lanes, combined, and
// reinterpreted back.
1856 EIGEN_STRONG_INLINE Packet2f por<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1857 return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1860 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1861 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
// NOTE(review): the body of the Packet4c specialization is not visible in this
// excerpt.
1864 EIGEN_STRONG_INLINE Packet4c por<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1868 EIGEN_STRONG_INLINE Packet8c por<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1869 return vorr_s8(a, b);
1872 EIGEN_STRONG_INLINE Packet16c por<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1873 return vorrq_s8(a, b);
// NOTE(review): the body of the Packet4uc specialization is not visible in
// this excerpt.
1876 EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1880 EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1881 return vorr_u8(a, b);
1884 EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1885 return vorrq_u8(a, b);
1888 EIGEN_STRONG_INLINE Packet4s por<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1889 return vorr_s16(a, b);
1892 EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1893 return vorrq_s16(a, b);
1896 EIGEN_STRONG_INLINE Packet4us por<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1897 return vorr_u16(a, b);
1900 EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1901 return vorrq_u16(a, b);
1904 EIGEN_STRONG_INLINE Packet2i por<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1905 return vorr_s32(a, b);
1908 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1909 return vorrq_s32(a, b);
1912 EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1913 return vorr_u32(a, b);
1916 EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1917 return vorrq_u32(a, b);
1920 EIGEN_STRONG_INLINE Packet2l por<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1921 return vorrq_s64(a, b);
1924 EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1925 return vorrq_u64(a, b);
// pxor specializations: bitwise XOR of `a` and `b` via veor_* / veorq_*.
// Float packets are reinterpreted as 32-bit unsigned lanes, combined, and
// reinterpreted back.
1929 EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
1930 return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1933 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1934 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
// NOTE(review): the body of the Packet4c specialization is not visible in this
// excerpt.
1937 EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
1941 EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
1942 return veor_s8(a, b);
1945 EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1946 return veorq_s8(a, b);
// NOTE(review): the body of the Packet4uc specialization is not visible in
// this excerpt.
1949 EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
1953 EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
1954 return veor_u8(a, b);
1957 EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1958 return veorq_u8(a, b);
1961 EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
1962 return veor_s16(a, b);
1965 EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1966 return veorq_s16(a, b);
1969 EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
1970 return veor_u16(a, b);
1973 EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1974 return veorq_u16(a, b);
1977 EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
1978 return veor_s32(a, b);
1981 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1982 return veorq_s32(a, b);
1985 EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
1986 return veor_u32(a, b);
1989 EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1990 return veorq_u32(a, b);
1993 EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1994 return veorq_s64(a, b);
1997 EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1998 return veorq_u64(a, b);
// pandnot specializations: a AND (NOT b), implemented with the NEON bit-clear
// intrinsics vbic_* / vbicq_* (which clear in `a` every bit that is set in
// `b`). Float packets are reinterpreted as 32-bit unsigned lanes, combined,
// and reinterpreted back.
2002 EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
2003 return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
2006 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2007 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
// NOTE(review): the body of the Packet4c specialization is not visible in this
// excerpt.
2010 EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(
const Packet4c& a,
const Packet4c& b) {
2014 EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
2015 return vbic_s8(a, b);
2018 EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
2019 return vbicq_s8(a, b);
// NOTE(review): the body of the Packet4uc specialization is not visible in
// this excerpt.
2022 EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b) {
2026 EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
2027 return vbic_u8(a, b);
2030 EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
2031 return vbicq_u8(a, b);
2034 EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
2035 return vbic_s16(a, b);
2038 EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
2039 return vbicq_s16(a, b);
2042 EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
2043 return vbic_u16(a, b);
2046 EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
2047 return vbicq_u16(a, b);
2050 EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
2051 return vbic_s32(a, b);
2054 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
2055 return vbicq_s32(a, b);
2058 EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
2059 return vbic_u32(a, b);
2062 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
2063 return vbicq_u32(a, b);
2066 EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
2067 return vbicq_s64(a, b);
2070 EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
2071 return vbicq_u64(a, b);
// parithmetic_shift_right overloads: shift every lane right by the immediate
// N using vshr_n_* / vshrq_n_*. The signed intrinsics replicate the sign bit;
// the unsigned intrinsics shift in zeros.
// NOTE(review): N is not declared in the visible lines; presumably it is a
// non-type template parameter declared on an elided `template <int N>` line.
//
// Packet4c / Packet4uc hold four bytes in a 32-bit scalar, so the value is
// broadcast into a 64-bit register, shifted byte-wise, and lane 0 is read
// back. These two overloads take their argument by non-const reference, unlike
// the others, which take it by value.
2075 EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
2076 return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2079 EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
2080 return vshr_n_s8(a, N);
2083 EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
2084 return vshrq_n_s8(a, N);
2087 EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
2088 return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2091 EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
2092 return vshr_n_u8(a, N);
2095 EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
2096 return vshrq_n_u8(a, N);
2099 EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
2100 return vshr_n_s16(a, N);
2103 EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
2104 return vshrq_n_s16(a, N);
2107 EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
2108 return vshr_n_u16(a, N);
2111 EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
2112 return vshrq_n_u16(a, N);
2115 EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
2116 return vshr_n_s32(a, N);
2119 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
2120 return vshrq_n_s32(a, N);
2123 EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
2124 return vshr_n_u32(a, N);
2127 EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
2128 return vshrq_n_u32(a, N);
2131 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
2132 return vshrq_n_s64(a, N);
2135 EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
2136 return vshrq_n_u64(a, N);
// plogical_shift_right for the signed 8-bit packets: a logical shift must fill
// with zeros, so the signed lanes are reinterpreted as unsigned bytes, shifted
// with the unsigned intrinsic (vshr_n_u8 / vshrq_n_u8), and reinterpreted back.
// NOTE(review): N is not declared in the visible lines; presumably it is a
// non-type template parameter declared on an elided `template <int N>` line.
//
// Packet4c holds four bytes in a 32-bit scalar: broadcast, shift, read lane 0.
2140 EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
2141 return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
2144 EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
2145 return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
2148 EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
2149 return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
2152 EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
2153 return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
// plogical_shift_right for the remaining packets: zero-filling right shift of
// every lane by the immediate N. Unsigned packets call vshr_n_u* / vshrq_n_u*
// directly; signed packets are reinterpreted as unsigned lanes of the same
// width, shifted, and reinterpreted back so that no sign bits are shifted in.
// NOTE(review): N is not declared in the visible lines; presumably it is a
// non-type template parameter declared on an elided `template <int N>` line.
2156 EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
2157 return vshr_n_u8(a, N);
2160 EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
2161 return vshrq_n_u8(a, N);
2164 EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
2165 return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
2168 EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
2169 return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
2172 EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
2173 return vshr_n_u16(a, N);
2176 EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
2177 return vshrq_n_u16(a, N);
2180 EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
2181 return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
2184 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
2185 return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
2188 EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
2189 return vshr_n_u32(a, N);
2192 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
2193 return vshrq_n_u32(a, N);
2196 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
2197 return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
2200 EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
2201 return vshrq_n_u64(a, N);
// plogical_shift_left overloads: shift every lane left by the immediate N
// using vshl_n_* / vshlq_n_*. Signed and unsigned packets each call the
// intrinsic for their own element type; no reinterpretation is done here.
// NOTE(review): N is not declared in the visible lines; presumably it is a
// non-type template parameter declared on an elided `template <int N>` line.
//
// Packet4c / Packet4uc hold four bytes in a 32-bit scalar, so the value is
// broadcast into a 64-bit register, shifted byte-wise, and lane 0 is read
// back. These two overloads take their argument by non-const reference.
2205 EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
2206 return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2209 EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
2210 return vshl_n_s8(a, N);
2213 EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
2214 return vshlq_n_s8(a, N);
2217 EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
2218 return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2221 EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
2222 return vshl_n_u8(a, N);
2225 EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
2226 return vshlq_n_u8(a, N);
2229 EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
2230 return vshl_n_s16(a, N);
2233 EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
2234 return vshlq_n_s16(a, N);
2237 EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
2238 return vshl_n_u16(a, N);
2241 EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
2242 return vshlq_n_u16(a, N);
2245 EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
2246 return vshl_n_s32(a, N);
2249 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
2250 return vshlq_n_s32(a, N);
2253 EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
2254 return vshl_n_u32(a, N);
2257 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
2258 return vshlq_n_u32(a, N);
2261 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
2262 return vshlq_n_s64(a, N);
2265 EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
2266 return vshlq_n_u64(a, N);
// Aligned load of two floats. `from` is passed through assume_aligned with
// unpacket_traits<Packet2f>::alignment before being handed to vld1_f32.
2270 EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(
const float* from) {
2271 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(from));
// Aligned load of four floats. `from` is passed through assume_aligned with
// unpacket_traits<Packet4f>::alignment before being handed to vld1q_f32.
2274 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
2275 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(from));
// Load of four int8_t values into a Packet4c: sizeof(Packet4c) bytes are copied
// from `from` into `res` with memcpy, so no alignment hint is applied here.
2278 EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(
const int8_t* from) {
2280 memcpy(&res, from,
sizeof(Packet4c));
// Aligned load of eight int8_t values (assume_aligned hint, then vld1_s8).
2284 EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(
const int8_t* from) {
2285 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(from));
// Aligned load of sixteen int8_t values (assume_aligned hint, then vld1q_s8).
2288 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const int8_t* from) {
2289 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(from));
// Load of four uint8_t values into a Packet4uc: sizeof(Packet4uc) bytes are copied
// from `from` into `res` with memcpy, so no alignment hint is applied here.
2292 EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(
const uint8_t* from) {
2294 memcpy(&res, from,
sizeof(Packet4uc));
// Aligned load of eight uint8_t values (assume_aligned hint, then vld1_u8).
2298 EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(
const uint8_t* from) {
2299 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(from));
// Aligned load of sixteen uint8_t values (assume_aligned hint, then vld1q_u8).
2302 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const uint8_t* from) {
2303 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(from));
// Aligned load of four int16_t values (assume_aligned hint, then vld1_s16).
2306 EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(
const int16_t* from) {
2307 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(from));
// Aligned load of eight int16_t values (assume_aligned hint, then vld1q_s16).
2310 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const int16_t* from) {
2311 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(from));
// Aligned load of four uint16_t values (assume_aligned hint, then vld1_u16).
2314 EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(
const uint16_t* from) {
2315 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(from));
// Aligned load of eight uint16_t values (assume_aligned hint, then vld1q_u16).
2318 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const uint16_t* from) {
2319 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(from));
// Aligned load of two int32_t values (assume_aligned hint, then vld1_s32).
2322 EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(
const int32_t* from) {
2323 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(from));
// Aligned load of four int32_t values (assume_aligned hint, then vld1q_s32).
2326 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
2327 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(from));
// Aligned load of two uint32_t values (assume_aligned hint, then vld1_u32).
2330 EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(
const uint32_t* from) {
2331 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(from));
// Aligned load of four uint32_t values (assume_aligned hint, then vld1q_u32).
2334 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(
const uint32_t* from) {
2335 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(from));
// Aligned load of two int64_t values (assume_aligned hint, then vld1q_s64).
2338 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(
const int64_t* from) {
2339 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(from));
// Aligned load of two uint64_t values (assume_aligned hint, then vld1q_u64).
2342 EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(
const uint64_t* from) {
2343 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(from));
// Unaligned load of two floats: vld1_f32 is applied to `from` without an alignment hint.
2347 EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(
const float* from) {
2348 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_f32(from);
// Unaligned load of four floats: vld1q_f32 is applied to `from` without an alignment hint.
2351 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
2352 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f32(from);
// Unaligned load of four int8_t values: sizeof(Packet4c) bytes are copied from
// `from` into `res` with memcpy.
2355 EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(
const int8_t* from) {
2357 memcpy(&res, from,
sizeof(Packet4c));
// Unaligned load of eight int8_t values via vld1_s8.
2361 EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(
const int8_t* from) {
2362 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s8(from);
// Unaligned load of sixteen int8_t values via vld1q_s8.
2365 EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const int8_t* from) {
2366 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s8(from);
// Unaligned load of four uint8_t values: sizeof(Packet4uc) bytes are copied from
// `from` into `res` with memcpy.
2369 EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(
const uint8_t* from) {
2371 memcpy(&res, from,
sizeof(Packet4uc));
// Unaligned load of eight uint8_t values via vld1_u8.
2375 EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(
const uint8_t* from) {
2376 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u8(from);
// Unaligned load of sixteen uint8_t values via vld1q_u8.
2379 EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const uint8_t* from) {
2380 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u8(from);
// Unaligned load of four int16_t values via vld1_s16.
2383 EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(
const int16_t* from) {
2384 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s16(from);
// Unaligned load of eight int16_t values via vld1q_s16.
2387 EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const int16_t* from) {
2388 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s16(from);
// Unaligned load of four uint16_t values via vld1_u16.
2391 EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(
const uint16_t* from) {
2392 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u16(from);
// Unaligned load of eight uint16_t values via vld1q_u16.
2395 EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const uint16_t* from) {
2396 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u16(from);
// Unaligned load of two int32_t values via vld1_s32.
2399 EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(
const int32_t* from) {
2400 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s32(from);
// Unaligned load of four int32_t values via vld1q_s32.
2403 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
2404 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s32(from);
// Unaligned load of two uint32_t values via vld1_u32.
2407 EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(
const uint32_t* from) {
2408 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u32(from);
// Unaligned load of four uint32_t values via vld1q_u32.
2411 EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(
const uint32_t* from) {
2412 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u32(from);
// Unaligned load of two int64_t values via vld1q_s64.
2415 EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(
const int64_t* from) {
2416 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s64(from);
// Unaligned load of two uint64_t values via vld1q_u64.
2419 EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(
const uint64_t* from) {
2420 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u64(from);
// Reads from[0] and broadcasts it to both lanes: {f0, f0}.
2424 EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(
const float* from) {
2425 return vld1_dup_f32(from);
// Reads from[0] and from[1], each broadcast to a 64-bit half: {f0, f0, f1, f1}.
2428 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
2429 return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
// Produces {b0, b0, b1, b1}: the packet loaded by pload<Packet4c> is broadcast into a
// 64-bit vector, zipped with itself, and the low 32 bits of the first zip half are kept.
// NOTE(review): pload<Packet4c> copies sizeof(Packet4c) bytes although only the first
// two end up in the result; confirm callers guarantee that many readable bytes.
2432 EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(
const int8_t* from) {
2433 const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
2434 return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
// Produces {b0, b0, b1, b1, b2, b2, b3, b3}: first half of zipping the loaded vector with itself.
// NOTE(review): vld1_s8 reads eight bytes although only the first four are used;
// confirm callers guarantee that many readable bytes.
2437 EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(
const int8_t* from) {
2438 const int8x8_t a = vld1_s8(from);
2439 return vzip_s8(a, a).val[0];
// Loads eight bytes and duplicates each one: both halves of the self-zip are combined
// into a 128-bit vector {b0, b0, b1, b1, ..., b7, b7}.
2442 EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const int8_t* from) {
2443 const int8x8_t a = vld1_s8(from);
2444 const int8x8x2_t b = vzip_s8(a, a);
2445 return vcombine_s8(b.val[0], b.val[1]);
// Produces {b0, b0, b1, b1}: the packet loaded by pload<Packet4uc> is broadcast into a
// 64-bit vector, zipped with itself, and the low 32 bits of the first zip half are kept.
// NOTE(review): pload<Packet4uc> copies sizeof(Packet4uc) bytes although only the first
// two end up in the result; confirm callers guarantee that many readable bytes.
2448 EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(
const uint8_t* from) {
2449 const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
2450 return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
// Produces {b0, b0, b1, b1, b2, b2, b3, b3}: first half of zipping the loaded vector with itself.
// NOTE(review): vld1_u8 reads eight bytes although only the first four are used;
// confirm callers guarantee that many readable bytes.
2453 EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(
const uint8_t* from) {
2454 const uint8x8_t a = vld1_u8(from);
2455 return vzip_u8(a, a).val[0];
// Loads eight bytes and duplicates each one: both halves of the self-zip are combined
// into a 128-bit vector {b0, b0, b1, b1, ..., b7, b7}.
2458 EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const uint8_t* from) {
2459 const uint8x8_t a = vld1_u8(from);
2460 const uint8x8x2_t b = vzip_u8(a, a);
2461 return vcombine_u8(b.val[0], b.val[1]);
// Produces {s0, s0, s1, s1}: from[0] and from[1] are each broadcast, viewed as 32-bit
// lanes, and the first half of their 32-bit zip is reinterpreted back to 16-bit lanes.
2464 EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(
const int16_t* from) {
2465 return vreinterpret_s16_u32(
2466 vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
// Loads four values and duplicates each one: both halves of the self-zip are combined
// into {s0, s0, s1, s1, s2, s2, s3, s3}.
2469 EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const int16_t* from) {
2470 const int16x4_t a = vld1_s16(from);
2471 const int16x4x2_t b = vzip_s16(a, a);
2472 return vcombine_s16(b.val[0], b.val[1]);
// Unsigned counterpart of the Packet4s version: {u0, u0, u1, u1}.
2475 EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(
const uint16_t* from) {
2476 return vreinterpret_u16_u32(
2477 vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
// Unsigned counterpart of the Packet8s version: {u0, u0, u1, u1, u2, u2, u3, u3}.
2480 EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const uint16_t* from) {
2481 const uint16x4_t a = vld1_u16(from);
2482 const uint16x4x2_t b = vzip_u16(a, a);
2483 return vcombine_u16(b.val[0], b.val[1]);
// Reads from[0] and broadcasts it to both lanes: {i0, i0}.
2486 EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(
const int32_t* from) {
2487 return vld1_dup_s32(from);
// Reads from[0] and from[1], each broadcast to a 64-bit half: {i0, i0, i1, i1}.
2490 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
2491 return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
// Reads from[0] and broadcasts it to both lanes: {u0, u0}.
2494 EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(
const uint32_t* from) {
2495 return vld1_dup_u32(from);
// Reads from[0] and from[1], each broadcast to a 64-bit half: {u0, u0, u1, u1}.
2498 EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(
const uint32_t* from) {
2499 return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
// Reads from[0] and broadcasts it to both 64-bit lanes.
2502 EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(
const int64_t* from) {
2503 return vld1q_dup_s64(from);
// Reads from[0] and broadcasts it to both 64-bit lanes.
2506 EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(
const uint64_t* from) {
2507 return vld1q_dup_u64(from);
// Reads from[0] and broadcasts it to all four lanes.
2511 EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(
const float* from) {
2512 return vld1q_dup_f32(from);
// Four copies of from[0]: the byte is broadcast to a 64-bit vector and the low
// 32 bits are returned.
2515 EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(
const int8_t* from) {
2516 return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
// Four copies of from[0] followed by four copies of from[1]: the two broadcast
// vectors are zipped as 32-bit lanes and the first zip half is kept.
2519 EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(
const int8_t* from) {
2520 return vreinterpret_s8_u32(
2521 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
// Four copies each of from[0..3]: `a` holds the quads for from[0] and from[1],
// `b` the quads for from[2] and from[3]; they form the low and high halves.
2524 EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(
const int8_t* from) {
2525 const int8x8_t a = vreinterpret_s8_u32(
2526 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2527 const int8x8_t b = vreinterpret_s8_u32(
2528 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
2529 return vcombine_s8(a, b);
// Four copies of from[0]: the byte is broadcast to a 64-bit vector and the low
// 32 bits are returned.
2532 EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(
const uint8_t* from) {
2533 return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
// Four copies of from[0] followed by four copies of from[1]: the two broadcast
// vectors are zipped as 32-bit lanes and the first zip half is kept.
2536 EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(
const uint8_t* from) {
2537 return vreinterpret_u8_u32(
2538 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
// Four copies each of from[0..3]: `a` holds the quads for from[0] and from[1],
// `b` the quads for from[2] and from[3]; they form the low and high halves.
2541 EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(
const uint8_t* from) {
2542 const uint8x8_t a = vreinterpret_u8_u32(
2543 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2544 const uint8x8_t b = vreinterpret_u8_u32(
2545 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
2546 return vcombine_u8(a, b);
// Four copies of from[0] in the low half and four copies of from[1] in the high half.
2549 EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const int16_t* from) {
2550 return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
// Four copies of from[0] in the low half and four copies of from[1] in the high half.
2553 EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const uint16_t* from) {
2554 return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
// Reads from[0] and broadcasts it to all four lanes.
2557 EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(
const int32_t* from) {
2558 return vld1q_dup_s32(from);
// Reads from[0] and broadcasts it to all four lanes.
2561 EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(
const uint32_t* from) {
2562 return vld1q_dup_u32(from);
// Aligned store of two floats: `to` is passed through assume_aligned with
// unpacket_traits<Packet2f>::alignment before vst1_f32 writes the packet.
2566 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet2f& from) {
2567 EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(to), from);
// Aligned store of four floats: `to` is passed through assume_aligned with
// unpacket_traits<Packet4f>::alignment before vst1q_f32 writes the packet.
2570 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
2571 EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(to), from);
// Store of a Packet4c: sizeof(from) bytes are copied to `to` with memcpy, so no
// alignment hint is applied here.
2574 EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet4c& from) {
2575 memcpy(to, &from,
sizeof(from));
// Aligned store of eight int8_t values (assume_aligned hint, then vst1_s8).
2578 EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet8c& from) {
2579 EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(to), from);
// Aligned store of sixteen int8_t values (assume_aligned hint, then vst1q_s8).
2582 EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet16c& from) {
2583 EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(to), from);
// Store of a Packet4uc: sizeof(from) bytes are copied to `to` with memcpy, so no
// alignment hint is applied here.
2586 EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet4uc& from) {
2587 memcpy(to, &from,
sizeof(from));
// Aligned store of eight uint8_t values (assume_aligned hint, then vst1_u8).
2590 EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet8uc& from) {
2591 EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(to), from);
// Aligned store of sixteen uint8_t values (assume_aligned hint, then vst1q_u8).
2594 EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet16uc& from) {
2595 EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(to), from);
// Aligned store of four int16_t values (assume_aligned hint, then vst1_s16).
2598 EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet4s& from) {
2599 EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(to), from);
// Aligned store of eight int16_t values (assume_aligned hint, then vst1q_s16).
2602 EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet8s& from) {
2603 EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(to), from);
// Aligned store of four uint16_t values (assume_aligned hint, then vst1_u16).
2606 EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet4us& from) {
2607 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(to), from);
// Aligned store of eight uint16_t values (assume_aligned hint, then vst1q_u16).
2610 EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet8us& from) {
2611 EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(to), from);
// Aligned store of two int32_t values (assume_aligned hint, then vst1_s32).
2614 EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet2i& from) {
2615 EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(to), from);
// Aligned store of four int32_t values (assume_aligned hint, then vst1q_s32).
2618 EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
2619 EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(to), from);
// Aligned store of two uint32_t values (assume_aligned hint, then vst1_u32).
2622 EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet2ui& from) {
2623 EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(to), from);
// Aligned store of four uint32_t values (assume_aligned hint, then vst1q_u32).
2626 EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet4ui& from) {
2627 EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(to), from);
// Aligned store of two int64_t values (assume_aligned hint, then vst1q_s64).
2630 EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet2l& from) {
2631 EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(to), from);
// Aligned store of two uint64_t values (assume_aligned hint, then vst1q_u64).
2634 EIGEN_STRONG_INLINE
void pstore<uint64_t>(uint64_t* to,
const Packet2ul& from) {
2635 EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(to), from);
// Unaligned store of two floats: vst1_f32 writes to `to` without an alignment hint.
2639 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet2f& from) {
2640 EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
// Unaligned store of four floats: vst1q_f32 writes to `to` without an alignment hint.
2643 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
2644 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
// Unaligned store of a Packet4c: sizeof(from) bytes are copied to `to` with memcpy.
2647 EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet4c& from) {
2648 memcpy(to, &from,
sizeof(from));
// Unaligned store of eight int8_t values via vst1_s8.
2651 EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet8c& from) {
2652 EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
// Unaligned store of sixteen int8_t values via vst1q_s8.
2655 EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet16c& from) {
2656 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
// Unaligned store of a Packet4uc: sizeof(from) bytes are copied to `to` with memcpy.
2659 EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet4uc& from) {
2660 memcpy(to, &from,
sizeof(from));
// Unaligned store of eight uint8_t values via vst1_u8.
2663 EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet8uc& from) {
2664 EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
// Unaligned store of sixteen uint8_t values via vst1q_u8.
2667 EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet16uc& from) {
2668 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
// Unaligned store of four int16_t values via vst1_s16.
2671 EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet4s& from) {
2672 EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
// Unaligned store of eight int16_t values via vst1q_s16.
2675 EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet8s& from) {
2676 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
// Unaligned store of four uint16_t values via vst1_u16.
2679 EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet4us& from) {
2680 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
// Unaligned store of eight uint16_t values via vst1q_u16.
2683 EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet8us& from) {
2684 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
// Unaligned store of two int32_t values via vst1_s32.
2687 EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet2i& from) {
2688 EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
// Unaligned store of four int32_t values via vst1q_s32.
2691 EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
2692 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
// Unaligned store of two uint32_t values via vst1_u32.
2695 EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet2ui& from) {
2696 EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
// Unaligned store of four uint32_t values via vst1q_u32.
2699 EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet4ui& from) {
2700 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
// Unaligned store of two int64_t values via vst1q_s64.
2703 EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet2l& from) {
2704 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
// Unaligned store of two uint64_t values via vst1q_u64.
2707 EIGEN_STRONG_INLINE
void pstoreu<uint64_t>(uint64_t* to,
const Packet2ul& from) {
2708 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
// Strided gather of two floats: from[0] is broadcast to seed `res`, then lane 1 is
// replaced by from[stride].
2712 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(
const float* from,
Index stride) {
2713 Packet2f res = vld1_dup_f32(from);
2714 res = vld1_lane_f32(from + 1 * stride, res, 1);
// Strided gather of four floats: from[0] is broadcast to seed `res`, then lane i
// (i = 1..3) is replaced by from[i * stride].
2718 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
2719 Packet4f res = vld1q_dup_f32(from);
2720 res = vld1q_lane_f32(from + 1 * stride, res, 1);
2721 res = vld1q_lane_f32(from + 2 * stride, res, 2);
2722 res = vld1q_lane_f32(from + 3 * stride, res, 3);
// Strided gather of four int8_t values: byte i of `res` (accessed through an
// int8_t pointer) is set to from[i * stride] for i = 0..3.
2726 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(
const int8_t* from,
Index stride) {
2728 for (
int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
// Strided gather of eight int8_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..7) is replaced by from[i * stride].
2732 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(
const int8_t* from,
Index stride) {
2733 Packet8c res = vld1_dup_s8(from);
2734 res = vld1_lane_s8(from + 1 * stride, res, 1);
2735 res = vld1_lane_s8(from + 2 * stride, res, 2);
2736 res = vld1_lane_s8(from + 3 * stride, res, 3);
2737 res = vld1_lane_s8(from + 4 * stride, res, 4);
2738 res = vld1_lane_s8(from + 5 * stride, res, 5);
2739 res = vld1_lane_s8(from + 6 * stride, res, 6);
2740 res = vld1_lane_s8(from + 7 * stride, res, 7);
// Strided gather of sixteen int8_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..15) is replaced by from[i * stride].
2744 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(
const int8_t* from,
Index stride) {
2745 Packet16c res = vld1q_dup_s8(from);
2746 res = vld1q_lane_s8(from + 1 * stride, res, 1);
2747 res = vld1q_lane_s8(from + 2 * stride, res, 2);
2748 res = vld1q_lane_s8(from + 3 * stride, res, 3);
2749 res = vld1q_lane_s8(from + 4 * stride, res, 4);
2750 res = vld1q_lane_s8(from + 5 * stride, res, 5);
2751 res = vld1q_lane_s8(from + 6 * stride, res, 6);
2752 res = vld1q_lane_s8(from + 7 * stride, res, 7);
2753 res = vld1q_lane_s8(from + 8 * stride, res, 8);
2754 res = vld1q_lane_s8(from + 9 * stride, res, 9);
2755 res = vld1q_lane_s8(from + 10 * stride, res, 10);
2756 res = vld1q_lane_s8(from + 11 * stride, res, 11);
2757 res = vld1q_lane_s8(from + 12 * stride, res, 12);
2758 res = vld1q_lane_s8(from + 13 * stride, res, 13);
2759 res = vld1q_lane_s8(from + 14 * stride, res, 14);
2760 res = vld1q_lane_s8(from + 15 * stride, res, 15);
// Strided gather of four uint8_t values: byte i of `res` (accessed through a
// uint8_t pointer) is set to from[i * stride] for i = 0..3.
2764 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(
const uint8_t* from,
Index stride) {
2766 for (
int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
// Strided gather of eight uint8_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..7) is replaced by from[i * stride].
2770 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(
const uint8_t* from,
Index stride) {
2771 Packet8uc res = vld1_dup_u8(from);
2772 res = vld1_lane_u8(from + 1 * stride, res, 1);
2773 res = vld1_lane_u8(from + 2 * stride, res, 2);
2774 res = vld1_lane_u8(from + 3 * stride, res, 3);
2775 res = vld1_lane_u8(from + 4 * stride, res, 4);
2776 res = vld1_lane_u8(from + 5 * stride, res, 5);
2777 res = vld1_lane_u8(from + 6 * stride, res, 6);
2778 res = vld1_lane_u8(from + 7 * stride, res, 7);
// Strided gather of sixteen uint8_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..15) is replaced by from[i * stride].
2782 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(
const uint8_t* from,
Index stride) {
2783 Packet16uc res = vld1q_dup_u8(from);
2784 res = vld1q_lane_u8(from + 1 * stride, res, 1);
2785 res = vld1q_lane_u8(from + 2 * stride, res, 2);
2786 res = vld1q_lane_u8(from + 3 * stride, res, 3);
2787 res = vld1q_lane_u8(from + 4 * stride, res, 4);
2788 res = vld1q_lane_u8(from + 5 * stride, res, 5);
2789 res = vld1q_lane_u8(from + 6 * stride, res, 6);
2790 res = vld1q_lane_u8(from + 7 * stride, res, 7);
2791 res = vld1q_lane_u8(from + 8 * stride, res, 8);
2792 res = vld1q_lane_u8(from + 9 * stride, res, 9);
2793 res = vld1q_lane_u8(from + 10 * stride, res, 10);
2794 res = vld1q_lane_u8(from + 11 * stride, res, 11);
2795 res = vld1q_lane_u8(from + 12 * stride, res, 12);
2796 res = vld1q_lane_u8(from + 13 * stride, res, 13);
2797 res = vld1q_lane_u8(from + 14 * stride, res, 14);
2798 res = vld1q_lane_u8(from + 15 * stride, res, 15);
// Strided gather of four int16_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..3) is replaced by from[i * stride].
2802 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(
const int16_t* from,
Index stride) {
2803 Packet4s res = vld1_dup_s16(from);
2804 res = vld1_lane_s16(from + 1 * stride, res, 1);
2805 res = vld1_lane_s16(from + 2 * stride, res, 2);
2806 res = vld1_lane_s16(from + 3 * stride, res, 3);
// Strided gather of eight int16_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..7) is replaced by from[i * stride].
2810 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(
const int16_t* from,
Index stride) {
2811 Packet8s res = vld1q_dup_s16(from);
2812 res = vld1q_lane_s16(from + 1 * stride, res, 1);
2813 res = vld1q_lane_s16(from + 2 * stride, res, 2);
2814 res = vld1q_lane_s16(from + 3 * stride, res, 3);
2815 res = vld1q_lane_s16(from + 4 * stride, res, 4);
2816 res = vld1q_lane_s16(from + 5 * stride, res, 5);
2817 res = vld1q_lane_s16(from + 6 * stride, res, 6);
2818 res = vld1q_lane_s16(from + 7 * stride, res, 7);
// Strided gather of four uint16_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..3) is replaced by from[i * stride].
2822 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(
const uint16_t* from,
Index stride) {
2823 Packet4us res = vld1_dup_u16(from);
2824 res = vld1_lane_u16(from + 1 * stride, res, 1);
2825 res = vld1_lane_u16(from + 2 * stride, res, 2);
2826 res = vld1_lane_u16(from + 3 * stride, res, 3);
// Strided gather of eight uint16_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..7) is replaced by from[i * stride].
2830 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(
const uint16_t* from,
Index stride) {
2831 Packet8us res = vld1q_dup_u16(from);
2832 res = vld1q_lane_u16(from + 1 * stride, res, 1);
2833 res = vld1q_lane_u16(from + 2 * stride, res, 2);
2834 res = vld1q_lane_u16(from + 3 * stride, res, 3);
2835 res = vld1q_lane_u16(from + 4 * stride, res, 4);
2836 res = vld1q_lane_u16(from + 5 * stride, res, 5);
2837 res = vld1q_lane_u16(from + 6 * stride, res, 6);
2838 res = vld1q_lane_u16(from + 7 * stride, res, 7);
// Strided gather of two int32_t values: from[0] is broadcast to seed `res`, then
// lane 1 is replaced by from[stride].
2842 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(
const int32_t* from,
Index stride) {
2843 Packet2i res = vld1_dup_s32(from);
2844 res = vld1_lane_s32(from + 1 * stride, res, 1);
// Strided gather of four int32_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..3) is replaced by from[i * stride].
2848 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
2849 Packet4i res = vld1q_dup_s32(from);
2850 res = vld1q_lane_s32(from + 1 * stride, res, 1);
2851 res = vld1q_lane_s32(from + 2 * stride, res, 2);
2852 res = vld1q_lane_s32(from + 3 * stride, res, 3);
// Strided gather of two uint32_t values: from[0] is broadcast to seed `res`, then
// lane 1 is replaced by from[stride].
2856 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(
const uint32_t* from,
Index stride) {
2857 Packet2ui res = vld1_dup_u32(from);
2858 res = vld1_lane_u32(from + 1 * stride, res, 1);
// Strided gather of four uint32_t values: from[0] is broadcast to seed `res`, then
// lane i (i = 1..3) is replaced by from[i * stride].
2862 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(
const uint32_t* from,
Index stride) {
2863 Packet4ui res = vld1q_dup_u32(from);
2864 res = vld1q_lane_u32(from + 1 * stride, res, 1);
2865 res = vld1q_lane_u32(from + 2 * stride, res, 2);
2866 res = vld1q_lane_u32(from + 3 * stride, res, 3);
// Strided gather of two int64_t values: from[0] is broadcast to seed `res`, then
// lane 1 is replaced by from[stride].
2870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(
const int64_t* from,
Index stride) {
2871 Packet2l res = vld1q_dup_s64(from);
2872 res = vld1q_lane_s64(from + 1 * stride, res, 1);
// Strided gather of two uint64_t values: from[0] is broadcast to seed `res`, then
// lane 1 is replaced by from[stride].
2876 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(
const uint64_t* from,
Index stride) {
2877 Packet2ul res = vld1q_dup_u64(from);
2878 res = vld1q_lane_u64(from + 1 * stride, res, 1);
// Strided scatter of two floats: lane i of `from` is written to to[i * stride].
2883 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet2f>(
float* to,
const Packet2f& from,
Index stride) {
2884 vst1_lane_f32(to + stride * 0, from, 0);
2885 vst1_lane_f32(to + stride * 1, from, 1);
// Strided scatter of four floats: lane i of `from` is written to to[i * stride].
2888 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
2889 vst1q_lane_f32(to + stride * 0, from, 0);
2890 vst1q_lane_f32(to + stride * 1, from, 1);
2891 vst1q_lane_f32(to + stride * 2, from, 2);
2892 vst1q_lane_f32(to + stride * 3, from, 3);
// Strided scatter of four int8_t values: byte i of `from` (read through a
// const int8_t pointer) is written to to[i * stride] for i = 0..3.
2895 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet4c>(int8_t* to,
const Packet4c& from,
Index stride) {
2896 for (
int i = 0; i != 4; i++) *(to + i * stride) =
reinterpret_cast<const int8_t*
>(&from)[i];
// Strided scatter of eight int8_t values: lane i of `from` is written to to[i * stride].
2899 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet8c>(int8_t* to,
const Packet8c& from,
Index stride) {
2900 vst1_lane_s8(to + stride * 0, from, 0);
2901 vst1_lane_s8(to + stride * 1, from, 1);
2902 vst1_lane_s8(to + stride * 2, from, 2);
2903 vst1_lane_s8(to + stride * 3, from, 3);
2904 vst1_lane_s8(to + stride * 4, from, 4);
2905 vst1_lane_s8(to + stride * 5, from, 5);
2906 vst1_lane_s8(to + stride * 6, from, 6);
2907 vst1_lane_s8(to + stride * 7, from, 7);
// Strided scatter of sixteen int8_t values: lane i of `from` is written to to[i * stride].
2910 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet16c>(int8_t* to,
const Packet16c& from,
2912 vst1q_lane_s8(to + stride * 0, from, 0);
2913 vst1q_lane_s8(to + stride * 1, from, 1);
2914 vst1q_lane_s8(to + stride * 2, from, 2);
2915 vst1q_lane_s8(to + stride * 3, from, 3);
2916 vst1q_lane_s8(to + stride * 4, from, 4);
2917 vst1q_lane_s8(to + stride * 5, from, 5);
2918 vst1q_lane_s8(to + stride * 6, from, 6);
2919 vst1q_lane_s8(to + stride * 7, from, 7);
2920 vst1q_lane_s8(to + stride * 8, from, 8);
2921 vst1q_lane_s8(to + stride * 9, from, 9);
2922 vst1q_lane_s8(to + stride * 10, from, 10);
2923 vst1q_lane_s8(to + stride * 11, from, 11);
2924 vst1q_lane_s8(to + stride * 12, from, 12);
2925 vst1q_lane_s8(to + stride * 13, from, 13);
2926 vst1q_lane_s8(to + stride * 14, from, 14);
2927 vst1q_lane_s8(to + stride * 15, from, 15);
// Strided scatter of four uint8_t values: byte i of `from` (read through a
// const uint8_t pointer) is written to to[i * stride] for i = 0..3.
2930 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet4uc>(uint8_t* to,
const Packet4uc& from,
2932 for (
int i = 0; i != 4; i++) *(to + i * stride) =
reinterpret_cast<const uint8_t*
>(&from)[i];
// Strided scatter of eight uint8_t values: lane i of `from` is written to to[i * stride].
2935 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet8uc>(uint8_t* to,
const Packet8uc& from,
2937 vst1_lane_u8(to + stride * 0, from, 0);
2938 vst1_lane_u8(to + stride * 1, from, 1);
2939 vst1_lane_u8(to + stride * 2, from, 2);
2940 vst1_lane_u8(to + stride * 3, from, 3);
2941 vst1_lane_u8(to + stride * 4, from, 4);
2942 vst1_lane_u8(to + stride * 5, from, 5);
2943 vst1_lane_u8(to + stride * 6, from, 6);
2944 vst1_lane_u8(to + stride * 7, from, 7);
// Strided scatter of sixteen uint8_t values: lane i of `from` is written to to[i * stride].
2947 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet16uc>(uint8_t* to,
const Packet16uc& from,
2949 vst1q_lane_u8(to + stride * 0, from, 0);
2950 vst1q_lane_u8(to + stride * 1, from, 1);
2951 vst1q_lane_u8(to + stride * 2, from, 2);
2952 vst1q_lane_u8(to + stride * 3, from, 3);
2953 vst1q_lane_u8(to + stride * 4, from, 4);
2954 vst1q_lane_u8(to + stride * 5, from, 5);
2955 vst1q_lane_u8(to + stride * 6, from, 6);
2956 vst1q_lane_u8(to + stride * 7, from, 7);
2957 vst1q_lane_u8(to + stride * 8, from, 8);
2958 vst1q_lane_u8(to + stride * 9, from, 9);
2959 vst1q_lane_u8(to + stride * 10, from, 10);
2960 vst1q_lane_u8(to + stride * 11, from, 11);
2961 vst1q_lane_u8(to + stride * 12, from, 12);
2962 vst1q_lane_u8(to + stride * 13, from, 13);
2963 vst1q_lane_u8(to + stride * 14, from, 14);
2964 vst1q_lane_u8(to + stride * 15, from, 15);
// Strided scatter of four int16_t values: lane i of `from` is written to to[i * stride].
2967 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet4s>(int16_t* to,
const Packet4s& from,
2969 vst1_lane_s16(to + stride * 0, from, 0);
2970 vst1_lane_s16(to + stride * 1, from, 1);
2971 vst1_lane_s16(to + stride * 2, from, 2);
2972 vst1_lane_s16(to + stride * 3, from, 3);
// Strided scatter of eight int16_t values: lane i of `from` is written to to[i * stride].
2975 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet8s>(int16_t* to,
const Packet8s& from,
2977 vst1q_lane_s16(to + stride * 0, from, 0);
2978 vst1q_lane_s16(to + stride * 1, from, 1);
2979 vst1q_lane_s16(to + stride * 2, from, 2);
2980 vst1q_lane_s16(to + stride * 3, from, 3);
2981 vst1q_lane_s16(to + stride * 4, from, 4);
2982 vst1q_lane_s16(to + stride * 5, from, 5);
2983 vst1q_lane_s16(to + stride * 6, from, 6);
2984 vst1q_lane_s16(to + stride * 7, from, 7);
// Strided scatter of four uint16_t values: lane i of `from` is written to to[i * stride].
2987 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet4us>(uint16_t* to,
const Packet4us& from,
2989 vst1_lane_u16(to + stride * 0, from, 0);
2990 vst1_lane_u16(to + stride * 1, from, 1);
2991 vst1_lane_u16(to + stride * 2, from, 2);
2992 vst1_lane_u16(to + stride * 3, from, 3);
// Strided scatter of eight uint16_t values: lane i of `from` is written to to[i * stride].
2995 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet8us>(uint16_t* to,
const Packet8us& from,
2997 vst1q_lane_u16(to + stride * 0, from, 0);
2998 vst1q_lane_u16(to + stride * 1, from, 1);
2999 vst1q_lane_u16(to + stride * 2, from, 2);
3000 vst1q_lane_u16(to + stride * 3, from, 3);
3001 vst1q_lane_u16(to + stride * 4, from, 4);
3002 vst1q_lane_u16(to + stride * 5, from, 5);
3003 vst1q_lane_u16(to + stride * 6, from, 6);
3004 vst1q_lane_u16(to + stride * 7, from, 7);
// Strided scatter of two int32_t values: lane i of `from` is written to to[i * stride].
3007 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet2i>(int32_t* to,
const Packet2i& from,
3009 vst1_lane_s32(to + stride * 0, from, 0);
3010 vst1_lane_s32(to + stride * 1, from, 1);
// Strided scatter of four int32_t values: lane i of `from` is written to to[i * stride].
3013 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
3015 vst1q_lane_s32(to + stride * 0, from, 0);
3016 vst1q_lane_s32(to + stride * 1, from, 1);
3017 vst1q_lane_s32(to + stride * 2, from, 2);
3018 vst1q_lane_s32(to + stride * 3, from, 3);
// Strided scatter of two uint32_t values: lane i of `from` is written to to[i * stride].
3021 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet2ui>(uint32_t* to,
const Packet2ui& from,
3023 vst1_lane_u32(to + stride * 0, from, 0);
3024 vst1_lane_u32(to + stride * 1, from, 1);
// Strided scatter of four uint32_t values: lane i of `from` is written to to[i * stride].
3027 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet4ui>(uint32_t* to,
const Packet4ui& from,
3029 vst1q_lane_u32(to + stride * 0, from, 0);
3030 vst1q_lane_u32(to + stride * 1, from, 1);
3031 vst1q_lane_u32(to + stride * 2, from, 2);
3032 vst1q_lane_u32(to + stride * 3, from, 3);
// Strided scatter of two int64_t values: lane i of `from` is written to to[i * stride].
3035 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int64_t, Packet2l>(int64_t* to,
const Packet2l& from,
3037 vst1q_lane_s64(to + stride * 0, from, 0);
3038 vst1q_lane_s64(to + stride * 1, from, 1);
// Strided scatter of two uint64_t values: lane i of `from` is written to to[i * stride].
3041 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint64_t, Packet2ul>(uint64_t* to,
const Packet2ul& from,
3043 vst1q_lane_u64(to + stride * 0, from, 0);
3044 vst1q_lane_u64(to + stride * 1, from, 1);
// prefetch specializations: each one forwards `addr` unchanged to the
// EIGEN_ARM_PREFETCH macro (defined outside this section); they differ only in
// the pointee type.
// float
3048 EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
3049 EIGEN_ARM_PREFETCH(addr);
// int8_t
3052 EIGEN_STRONG_INLINE
void prefetch<int8_t>(
const int8_t* addr) {
3053 EIGEN_ARM_PREFETCH(addr);
// uint8_t
3056 EIGEN_STRONG_INLINE
void prefetch<uint8_t>(
const uint8_t* addr) {
3057 EIGEN_ARM_PREFETCH(addr);
// int16_t
3060 EIGEN_STRONG_INLINE
void prefetch<int16_t>(
const int16_t* addr) {
3061 EIGEN_ARM_PREFETCH(addr);
// uint16_t
3064 EIGEN_STRONG_INLINE
void prefetch<uint16_t>(
const uint16_t* addr) {
3065 EIGEN_ARM_PREFETCH(addr);
// int32_t
3068 EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
3069 EIGEN_ARM_PREFETCH(addr);
// uint32_t
3072 EIGEN_STRONG_INLINE
void prefetch<uint32_t>(
const uint32_t* addr) {
3073 EIGEN_ARM_PREFETCH(addr);
// int64_t
3076 EIGEN_STRONG_INLINE
void prefetch<int64_t>(
const int64_t* addr) {
3077 EIGEN_ARM_PREFETCH(addr);
// uint64_t
3080 EIGEN_STRONG_INLINE
void prefetch<uint64_t>(
const uint64_t* addr) {
3081 EIGEN_ARM_PREFETCH(addr);
// Returns lane 0 of a two-float packet.
3085 EIGEN_STRONG_INLINE
float pfirst<Packet2f>(
const Packet2f& a) {
3086 return vget_lane_f32(a, 0);
// Returns lane 0 of a four-float packet.
3089 EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
3090 return vgetq_lane_f32(a, 0);
// Returns the low 8 bits of the packed 32-bit value, cast to int8_t.
// NOTE(review): this equals the first byte in memory only on a little-endian
// layout; confirm big-endian targets are not expected here.
3093 EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(
const Packet4c& a) {
3094 return static_cast<int8_t
>(a & 0xff);
// Returns lane 0 of an eight-lane int8_t packet.
3097 EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(
const Packet8c& a) {
3098 return vget_lane_s8(a, 0);
// Returns lane 0 of a sixteen-lane int8_t packet.
3101 EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(
const Packet16c& a) {
3102 return vgetq_lane_s8(a, 0);
// Returns the low 8 bits of the packed 32-bit value, cast to uint8_t.
// NOTE(review): this equals the first byte in memory only on a little-endian
// layout; confirm big-endian targets are not expected here.
3105 EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(
const Packet4uc& a) {
3106 return static_cast<uint8_t
>(a & 0xff);
// Returns lane 0 of an eight-lane uint8_t packet.
3109 EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(
const Packet8uc& a) {
3110 return vget_lane_u8(a, 0);
// Returns lane 0 of a sixteen-lane uint8_t packet.
3113 EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(
const Packet16uc& a) {
3114 return vgetq_lane_u8(a, 0);
// Returns lane 0 of a four-lane int16_t packet.
3117 EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(
const Packet4s& a) {
3118 return vget_lane_s16(a, 0);
// Returns lane 0 of an eight-lane int16_t packet.
3121 EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(
const Packet8s& a) {
3122 return vgetq_lane_s16(a, 0);
// Returns lane 0 of a four-lane uint16_t packet.
3125 EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(
const Packet4us& a) {
3126 return vget_lane_u16(a, 0);
// Returns lane 0 of an eight-lane uint16_t packet.
3129 EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(
const Packet8us& a) {
3130 return vgetq_lane_u16(a, 0);
3133 EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(
const Packet2i& a) {
3134 return vget_lane_s32(a, 0);
3137 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
3138 return vgetq_lane_s32(a, 0);
3141 EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(
const Packet2ui& a) {
3142 return vget_lane_u32(a, 0);
3145 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
3146 return vgetq_lane_u32(a, 0);
3149 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
3150 return vgetq_lane_s64(a, 0);
3153 EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(
const Packet2ul& a) {
3154 return vgetq_lane_u64(a, 0);
// preverse: returns the packet with its elements in reverse order.
// Pattern used throughout:
//  * 64-bit vectors: a single vrev64_* reverses the elements of the one
//    64-bit doubleword.
//  * 128-bit vectors: vrev64q_* reverses within each 64-bit half, then the two
//    halves are swapped with vcombine(high, low).
3158 EIGEN_STRONG_INLINE Packet2f preverse(
const Packet2f& a) {
3159 return vrev64_f32(a);
3162 EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
3163 const float32x4_t a_r64 = vrev64q_f32(a);
3164 return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
// Packet4c is a 32-bit scalar: it is duplicated into both halves of a 64-bit
// vector, the 8 bytes are reversed, and the first 32-bit lane is read back.
3167 EIGEN_STRONG_INLINE Packet4c preverse(
const Packet4c& a) {
3168 return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3171 EIGEN_STRONG_INLINE Packet8c preverse(
const Packet8c& a) {
3172 return vrev64_s8(a);
3175 EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a) {
3176 const int8x16_t a_r64 = vrev64q_s8(a);
3177 return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
// Unsigned counterpart of the Packet4c overload above.
3180 EIGEN_STRONG_INLINE Packet4uc preverse(
const Packet4uc& a) {
3181 return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
3184 EIGEN_STRONG_INLINE Packet8uc preverse(
const Packet8uc& a) {
3185 return vrev64_u8(a);
3188 EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a) {
3189 const uint8x16_t a_r64 = vrev64q_u8(a);
3190 return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
// preverse for the 16-, 32- and 64-bit integer packets.
// 64-bit vectors use one vrev64_*; 128-bit vectors reverse each 64-bit half
// with vrev64q_* and then swap the halves via vcombine(high, low).
3193 EIGEN_STRONG_INLINE Packet4s preverse(
const Packet4s& a) {
3194 return vrev64_s16(a);
3197 EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a) {
3198 const int16x8_t a_r64 = vrev64q_s16(a);
3199 return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
3202 EIGEN_STRONG_INLINE Packet4us preverse(
const Packet4us& a) {
3203 return vrev64_u16(a);
3206 EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a) {
3207 const uint16x8_t a_r64 = vrev64q_u16(a);
3208 return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
3211 EIGEN_STRONG_INLINE Packet2i preverse(
const Packet2i& a) {
3212 return vrev64_s32(a);
3215 EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
3216 const int32x4_t a_r64 = vrev64q_s32(a);
3217 return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
3220 EIGEN_STRONG_INLINE Packet2ui preverse(
const Packet2ui& a) {
3221 return vrev64_u32(a);
3224 EIGEN_STRONG_INLINE Packet4ui preverse(
const Packet4ui& a) {
3225 const uint32x4_t a_r64 = vrev64q_u32(a);
3226 return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
// With only two 64-bit elements, reversing is just swapping the two halves.
3229 EIGEN_STRONG_INLINE Packet2l preverse(
const Packet2l& a) {
3230 return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
3233 EIGEN_STRONG_INLINE Packet2ul preverse(
const Packet2ul& a) {
3234 return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
// pabs: element-wise absolute value.
// NOTE(review): several overloads below show only their signature in this
// excerpt (their bodies are not visible here), so only the overloads with a
// visible return statement are described.
3238 EIGEN_STRONG_INLINE Packet2f pabs(
const Packet2f& a) {
3242 EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
3243 return vabsq_f32(a);
// Packet4c is a 32-bit scalar: duplicate it into a 64-bit vector, take the
// per-byte absolute value, and read the first 32-bit lane back.
3246 EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(
const Packet4c& a) {
3247 return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3250 EIGEN_STRONG_INLINE Packet8c pabs(
const Packet8c& a) {
3254 EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
3258 EIGEN_STRONG_INLINE Packet4uc pabs(
const Packet4uc& a) {
3262 EIGEN_STRONG_INLINE Packet8uc pabs(
const Packet8uc& a) {
3266 EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
3270 EIGEN_STRONG_INLINE Packet4s pabs(
const Packet4s& a) {
3274 EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
3275 return vabsq_s16(a);
3278 EIGEN_STRONG_INLINE Packet4us pabs(
const Packet4us& a) {
3282 EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
3286 EIGEN_STRONG_INLINE Packet2i pabs(
const Packet2i& a) {
3290 EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
3291 return vabsq_s32(a);
3294 EIGEN_STRONG_INLINE Packet2ui pabs(
const Packet2ui& a) {
3298 EIGEN_STRONG_INLINE Packet4ui pabs(
const Packet4ui& a) {
// 64-bit signed: under EIGEN_ARCH_ARM64 the vector intrinsic vabsq_s64 is
// used. The second return applies std::abs to each lane separately and
// recombines the two results; presumably it is the non-ARM64 fallback (the
// #else line is not visible in this excerpt).
3302 EIGEN_STRONG_INLINE Packet2l pabs(
const Packet2l& a) {
3303 #if EIGEN_ARCH_ARM64 3304 return vabsq_s64(a);
3306 return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
3310 EIGEN_STRONG_INLINE Packet2ul pabs(
const Packet2ul& a) {
// psignbit: builds a per-lane mask from the sign bit of each float.
// The float bits are reinterpreted as signed 32-bit integers and shifted right
// by 31; the signed shift replicates the sign bit, so a lane becomes all ones
// when the sign bit was set and all zeros otherwise. The result is
// reinterpreted back to the float packet type.
3315 EIGEN_STRONG_INLINE Packet2f psignbit(
const Packet2f& a) {
3316 return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
3319 EIGEN_STRONG_INLINE Packet4f psignbit(
const Packet4f& a) {
3320 return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
// pfrexp for the float packets: no NEON-specific code, both overloads forward
// to pfrexp_generic. `exponent` is an output parameter filled by that helper.
3324 EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(
const Packet2f& a, Packet2f& exponent) {
3325 return pfrexp_generic(a, exponent);
3328 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
3329 return pfrexp_generic(a, exponent);
// pldexp for the float packets: both overloads forward to pldexp_generic with
// the value packet and the exponent packet unchanged.
3333 EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(
const Packet2f& a,
const Packet2f& exponent) {
3334 return pldexp_generic(a, exponent);
3337 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
3338 return pldexp_generic(a, exponent);
// predux (float): horizontal sum of all lanes, returned as a scalar.
// Two implementations are given. Under EIGEN_ARCH_ARM64 the across-vector add
// intrinsics (vaddv_f32 / vaddvq_f32) are used. The second pair, presumably the
// non-ARM64 branch (the #else line is not visible in this excerpt), uses
// pairwise adds instead.
3341 #if EIGEN_ARCH_ARM64 3343 EIGEN_STRONG_INLINE
float predux<Packet2f>(
const Packet2f& a) {
3344 return vaddv_f32(a);
3347 EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
3348 return vaddvq_f32(a);
// vpadd_f32(a, a) puts a[0] + a[1] in lane 0.
3352 EIGEN_STRONG_INLINE
float predux<Packet2f>(
const Packet2f& a) {
3353 return vget_lane_f32(vpadd_f32(a, a), 0);
// Add the low and high halves first, then one pairwise add finishes the sum.
3356 EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
3357 const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
3358 return vget_lane_f32(vpadd_f32(sum, sum), 0);
// predux (int8): horizontal sum of all lanes in 8-bit arithmetic.
// Packet4c is a 32-bit scalar holding 4 bytes: it is duplicated into a 64-bit
// vector and two pairwise adds leave the sum of the 4 bytes in lane 0.
3362 EIGEN_STRONG_INLINE int8_t predux<Packet4c>(
const Packet4c& a) {
3363 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3364 int8x8_t sum = vpadd_s8(a_dup, a_dup);
3365 sum = vpadd_s8(sum, sum);
3366 return vget_lane_s8(sum, 0);
// Under EIGEN_ARCH_ARM64 the 16-lane overload uses vaddvq_s8.
// NOTE(review): the body of the Packet8c overload in this branch is not
// visible in this excerpt.
3368 #if EIGEN_ARCH_ARM64 3370 EIGEN_STRONG_INLINE int8_t predux<Packet8c>(
const Packet8c& a) {
3374 EIGEN_STRONG_INLINE int8_t predux<Packet16c>(
const Packet16c& a) {
3375 return vaddvq_s8(a);
// Pairwise-add versions (presumably the non-ARM64 branch): three pairwise adds
// reduce 8 lanes to one.
3379 EIGEN_STRONG_INLINE int8_t predux<Packet8c>(
const Packet8c& a) {
3380 int8x8_t sum = vpadd_s8(a, a);
3381 sum = vpadd_s8(sum, sum);
3382 sum = vpadd_s8(sum, sum);
3383 return vget_lane_s8(sum, 0);
// 16 lanes: add the two 8-lane halves, then reduce as in the 8-lane case.
3386 EIGEN_STRONG_INLINE int8_t predux<Packet16c>(
const Packet16c& a) {
3387 int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
3388 sum = vpadd_s8(sum, sum);
3389 sum = vpadd_s8(sum, sum);
3390 sum = vpadd_s8(sum, sum);
3391 return vget_lane_s8(sum, 0);
// predux (uint8 scalar-wrapped packet): duplicate the 32-bit value into a
// 64-bit vector; two pairwise adds leave the sum of its 4 bytes in lane 0.
3395 EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(
const Packet4uc& a) {
3396 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3397 uint8x8_t sum = vpadd_u8(a_dup, a_dup);
3398 sum = vpadd_u8(sum, sum);
3399 return vget_lane_u8(sum, 0);
// EIGEN_ARCH_ARM64 branch: every remaining integer packet is summed with the
// matching across-vector add intrinsic (vaddv_* for 64-bit vectors, vaddvq_*
// for 128-bit vectors).
// NOTE(review): the body of the Packet8uc overload in this branch is not
// visible in this excerpt.
3401 #if EIGEN_ARCH_ARM64 3403 EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(
const Packet8uc& a) {
3407 EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(
const Packet16uc& a) {
3408 return vaddvq_u8(a);
3411 EIGEN_STRONG_INLINE int16_t predux<Packet4s>(
const Packet4s& a) {
3412 return vaddv_s16(a);
3415 EIGEN_STRONG_INLINE int16_t predux<Packet8s>(
const Packet8s& a) {
3416 return vaddvq_s16(a);
3419 EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(
const Packet4us& a) {
3420 return vaddv_u16(a);
3423 EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(
const Packet8us& a) {
3424 return vaddvq_u16(a);
3427 EIGEN_STRONG_INLINE int32_t predux<Packet2i>(
const Packet2i& a) {
3428 return vaddv_s32(a);
3431 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
3432 return vaddvq_s32(a);
3435 EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(
const Packet2ui& a) {
3436 return vaddv_u32(a);
3439 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(
const Packet4ui& a) {
3440 return vaddvq_u32(a);
3443 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(
const Packet2l& a) {
3444 return vaddvq_s64(a);
3447 EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(
const Packet2ul& a) {
3448 return vaddvq_u64(a);
// predux, pairwise-add implementations for the integer packets (presumably the
// non-ARM64 branch; the preprocessor lines are not visible in this excerpt).
// Scheme: for 128-bit vectors, first add the low and high halves; then apply
// vpadd_* log2(lane count) times so the full sum ends up in lane 0.
3452 EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(
const Packet8uc& a) {
3453 uint8x8_t sum = vpadd_u8(a, a);
3454 sum = vpadd_u8(sum, sum);
3455 sum = vpadd_u8(sum, sum);
3456 return vget_lane_u8(sum, 0);
3459 EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(
const Packet16uc& a) {
3460 uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
3461 sum = vpadd_u8(sum, sum);
3462 sum = vpadd_u8(sum, sum);
3463 sum = vpadd_u8(sum, sum);
3464 return vget_lane_u8(sum, 0);
3467 EIGEN_STRONG_INLINE int16_t predux<Packet4s>(
const Packet4s& a) {
3468 const int16x4_t sum = vpadd_s16(a, a);
3469 return vget_lane_s16(vpadd_s16(sum, sum), 0);
3472 EIGEN_STRONG_INLINE int16_t predux<Packet8s>(
const Packet8s& a) {
3473 int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
3474 sum = vpadd_s16(sum, sum);
3475 sum = vpadd_s16(sum, sum);
3476 return vget_lane_s16(sum, 0);
3479 EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(
const Packet4us& a) {
3480 const uint16x4_t sum = vpadd_u16(a, a);
3481 return vget_lane_u16(vpadd_u16(sum, sum), 0);
3484 EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(
const Packet8us& a) {
3485 uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
3486 sum = vpadd_u16(sum, sum);
3487 sum = vpadd_u16(sum, sum);
3488 return vget_lane_u16(sum, 0);
3491 EIGEN_STRONG_INLINE int32_t predux<Packet2i>(
const Packet2i& a) {
3492 return vget_lane_s32(vpadd_s32(a, a), 0);
3495 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
3496 const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
3497 return vget_lane_s32(vpadd_s32(sum, sum), 0);
3500 EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(
const Packet2ui& a) {
3501 return vget_lane_u32(vpadd_u32(a, a), 0);
3504 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(
const Packet4ui& a) {
3505 const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
3506 return vget_lane_u32(vpadd_u32(sum, sum), 0);
// 64-bit packets have only two lanes: extract both and add them as scalars.
3509 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(
const Packet2l& a) {
3510 return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
3513 EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(
const Packet2ul& a) {
3514 return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
// predux_half_dowto4: folds a packet onto one of half the size by adding the
// upper half of the lanes to the lower half element-wise.
//
// 8 bytes -> Packet4c (a 32-bit scalar): vrev64_s32 swaps the two 32-bit
// halves, so after the add every byte i holds a[i] + a[(i + 4) % 8]; the first
// 32-bit lane (bytes 0..3) is returned.
3519 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(
const Packet8c& a) {
3520 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
// 128-bit inputs: simply add the high and low 64-bit halves.
3523 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(
const Packet16c& a) {
3524 return vadd_s8(vget_high_s8(a), vget_low_s8(a));
// Unsigned counterpart of the Packet8c overload.
3527 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(
const Packet8uc& a) {
3528 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
3531 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(
const Packet16uc& a) {
3532 return vadd_u8(vget_high_u8(a), vget_low_u8(a));
3535 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(
const Packet8s& a) {
3536 return vadd_s16(vget_high_s16(a), vget_low_s16(a));
3539 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(
const Packet8us& a) {
3540 return vadd_u16(vget_high_u16(a), vget_low_u16(a));
// predux_mul: product of all lanes, returned as a scalar.
// Two lanes: multiply the two extracted scalars.
3546 EIGEN_STRONG_INLINE
float predux_mul<Packet2f>(
const Packet2f& a) {
3547 return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
// Four lanes: multiply low half by high half, then reuse the 2-lane overload.
3550 EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
3551 return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
// Packet4c (32-bit scalar, 4 bytes): after duplication into a 64-bit vector,
// vrev16_s8 swaps the bytes of each 16-bit pair, so the vector product holds
// a0*a1 in lanes 0..1 and a2*a3 in lanes 2..3; lanes 0 and 2 are multiplied.
3554 EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(
const Packet4c& a) {
3555 int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
3556 prod = vmul_s8(prod, vrev16_s8(prod));
3557 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
// 8 lanes: one more step than above. vrev32_s8 reverses the bytes of each
// 32-bit group, so after the second multiply lanes 0..3 hold the product of
// a0..a3 and lanes 4..7 the product of a4..a7; lanes 0 and 4 are multiplied.
3560 EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(
const Packet8c& a) {
3561 int8x8_t prod = vmul_s8(a, vrev16_s8(a));
3562 prod = vmul_s8(prod, vrev32_s8(prod));
3563 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
// 16 lanes: multiply low half by high half, then reuse the 8-lane overload.
3566 EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(
const Packet16c& a) {
3567 return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
// Unsigned 8-bit overloads: same scheme as the signed ones above.
3570 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(
const Packet4uc& a) {
3571 uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
3572 prod = vmul_u8(prod, vrev16_u8(prod));
3573 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
3576 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(
const Packet8uc& a) {
3577 uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
3578 prod = vmul_u8(prod, vrev32_u8(prod));
3579 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
3582 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(
const Packet16uc& a) {
3583 return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
// predux_mul for the 16-, 32- and 64-bit integer packets.
// 4 x 16-bit: vrev32_s16 swaps the two 16-bit elements of each 32-bit pair, so
// the vector product holds a0*a1 in lanes 0..1 and a2*a3 in lanes 2..3.
3586 EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(
const Packet4s& a) {
3587 const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
3588 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
// 8 x 16-bit: multiply low half by high half, then finish as in the 4-lane
// case. NOTE(review): the declaration of `prod` is not visible in this excerpt.
3591 EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(
const Packet8s& a) {
3595 prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
3597 prod = vmul_s16(prod, vrev32_s16(prod));
3599 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3602 EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(
const Packet4us& a) {
3603 const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
3604 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
// Unsigned counterpart of the Packet8s overload; `prod` is likewise declared
// on a line not visible in this excerpt.
3607 EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(
const Packet8us& a) {
3611 prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
3613 prod = vmul_u16(prod, vrev32_u16(prod));
3615 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
// 32-bit: two lanes are multiplied as scalars; four lanes first multiply the
// low half by the high half and then reuse the 2-lane overload.
3618 EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(
const Packet2i& a) {
3619 return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
3622 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
3623 return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
3626 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(
const Packet2ui& a) {
3627 return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
3630 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(
const Packet4ui& a) {
3631 return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
// 64-bit: extract both lanes and multiply them as scalars.
3634 EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(
const Packet2l& a) {
3635 return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
3638 EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(
const Packet2ul& a) {
3639 return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
// predux_min (float): minimum over all lanes, returned as a scalar.
// Under EIGEN_ARCH_ARM64 the across-vector intrinsics vminv_f32 / vminvq_f32
// are used. The second pair (presumably the non-ARM64 branch; the #else line
// is not visible in this excerpt) uses pairwise minimums instead.
3643 #if EIGEN_ARCH_ARM64 3645 EIGEN_STRONG_INLINE
float predux_min<Packet2f>(
const Packet2f& a) {
3646 return vminv_f32(a);
3649 EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
3650 return vminvq_f32(a);
// vpmin_f32(a, a) puts min(a[0], a[1]) in lane 0.
3654 EIGEN_STRONG_INLINE
float predux_min<Packet2f>(
const Packet2f& a) {
3655 return vget_lane_f32(vpmin_f32(a, a), 0);
// Element-wise min of the two halves, then one pairwise min.
3658 EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
3659 const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
3660 return vget_lane_f32(vpmin_f32(min, min), 0);
// predux_min (int8): minimum over all lanes.
// Packet4c is a 32-bit scalar holding 4 bytes: it is duplicated into a 64-bit
// vector and two pairwise minimums leave the overall minimum in lane 0.
3664 EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(
const Packet4c& a) {
3665 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3666 int8x8_t min = vpmin_s8(a_dup, a_dup);
3667 min = vpmin_s8(min, min);
3668 return vget_lane_s8(min, 0);
// Under EIGEN_ARCH_ARM64 the 16-lane overload uses vminvq_s8.
// NOTE(review): the body of the Packet8c overload in this branch is not
// visible in this excerpt.
3670 #if EIGEN_ARCH_ARM64 3672 EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(
const Packet8c& a) {
3676 EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(
const Packet16c& a) {
3677 return vminvq_s8(a);
// Pairwise versions (presumably the non-ARM64 branch): three pairwise
// minimums reduce 8 lanes to one.
3681 EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(
const Packet8c& a) {
3682 int8x8_t min = vpmin_s8(a, a);
3683 min = vpmin_s8(min, min);
3684 min = vpmin_s8(min, min);
3685 return vget_lane_s8(min, 0);
// 16 lanes: element-wise min of the two halves, then reduce as for 8 lanes.
3688 EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(
const Packet16c& a) {
3689 int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
3690 min = vpmin_s8(min, min);
3691 min = vpmin_s8(min, min);
3692 min = vpmin_s8(min, min);
3693 return vget_lane_s8(min, 0);
// predux_min (uint8 scalar-wrapped packet): duplicate the 32-bit value into a
// 64-bit vector; two pairwise minimums leave the minimum of its 4 bytes in
// lane 0.
3697 EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(
const Packet4uc& a) {
3698 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3699 uint8x8_t min = vpmin_u8(a_dup, a_dup);
3700 min = vpmin_u8(min, min);
3701 return vget_lane_u8(min, 0);
// EIGEN_ARCH_ARM64 branch: the remaining 8-, 16- and 32-bit integer packets use
// the matching across-vector minimum intrinsic (vminv_* / vminvq_*).
// NOTE(review): the body of the Packet8uc overload in this branch is not
// visible in this excerpt.
3703 #if EIGEN_ARCH_ARM64 3705 EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(
const Packet8uc& a) {
3709 EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(
const Packet16uc& a) {
3710 return vminvq_u8(a);
3713 EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(
const Packet4s& a) {
3714 return vminv_s16(a);
3717 EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(
const Packet8s& a) {
3718 return vminvq_s16(a);
3721 EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(
const Packet4us& a) {
3722 return vminv_u16(a);
3725 EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(
const Packet8us& a) {
3726 return vminvq_u16(a);
3729 EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(
const Packet2i& a) {
3730 return vminv_s32(a);
3733 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
3734 return vminvq_s32(a);
3737 EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(
const Packet2ui& a) {
3738 return vminv_u32(a);
3741 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(
const Packet4ui& a) {
3742 return vminvq_u32(a);
// predux_min, pairwise implementations for the 8-, 16- and 32-bit integer
// packets (presumably the non-ARM64 branch; the preprocessor lines are not
// visible in this excerpt).
// Scheme: for 128-bit vectors, take the element-wise min of the low and high
// halves; then apply vpmin_* log2(lane count) times so the overall minimum
// ends up in lane 0.
3746 EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(
const Packet8uc& a) {
3747 uint8x8_t min = vpmin_u8(a, a);
3748 min = vpmin_u8(min, min);
3749 min = vpmin_u8(min, min);
3750 return vget_lane_u8(min, 0);
3753 EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(
const Packet16uc& a) {
3754 uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
3755 min = vpmin_u8(min, min);
3756 min = vpmin_u8(min, min);
3757 min = vpmin_u8(min, min);
3758 return vget_lane_u8(min, 0);
3761 EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(
const Packet4s& a) {
3762 const int16x4_t min = vpmin_s16(a, a);
3763 return vget_lane_s16(vpmin_s16(min, min), 0);
3766 EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(
const Packet8s& a) {
3767 int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
3768 min = vpmin_s16(min, min);
3769 min = vpmin_s16(min, min);
3770 return vget_lane_s16(min, 0);
3773 EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(
const Packet4us& a) {
3774 const uint16x4_t min = vpmin_u16(a, a);
3775 return vget_lane_u16(vpmin_u16(min, min), 0);
3778 EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(
const Packet8us& a) {
3779 uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
3780 min = vpmin_u16(min, min);
3781 min = vpmin_u16(min, min);
3782 return vget_lane_u16(min, 0);
3785 EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(
const Packet2i& a) {
3786 return vget_lane_s32(vpmin_s32(a, a), 0);
3789 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
3790 const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
3791 return vget_lane_s32(vpmin_s32(min, min), 0);
3794 EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(
const Packet2ui& a) {
3795 return vget_lane_u32(vpmin_u32(a, a), 0);
3798 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(
const Packet4ui& a) {
3799 const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
3800 return vget_lane_u32(vpmin_u32(min, min), 0);
// predux_min for the 64-bit packets: both lanes are extracted and compared as
// scalars. std::min is parenthesised so a function-like `min` macro, if one is
// defined, is not expanded here.
3804 EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(
const Packet2l& a) {
3805 return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3808 EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(
const Packet2ul& a) {
3809 return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
// predux_max (float): maximum over all lanes, returned as a scalar.
// Under EIGEN_ARCH_ARM64 the across-vector intrinsics vmaxv_f32 / vmaxvq_f32
// are used. The second pair (presumably the non-ARM64 branch; the #else line
// is not visible in this excerpt) uses pairwise maximums instead.
3813 #if EIGEN_ARCH_ARM64 3815 EIGEN_STRONG_INLINE
float predux_max<Packet2f>(
const Packet2f& a) {
3816 return vmaxv_f32(a);
3819 EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
3820 return vmaxvq_f32(a);
// vpmax_f32(a, a) puts max(a[0], a[1]) in lane 0.
3824 EIGEN_STRONG_INLINE
float predux_max<Packet2f>(
const Packet2f& a) {
3825 return vget_lane_f32(vpmax_f32(a, a), 0);
// Element-wise max of the two halves, then one pairwise max.
3828 EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
3829 const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
3830 return vget_lane_f32(vpmax_f32(max, max), 0);
// predux_max (int8): maximum over all lanes.
// Packet4c is a 32-bit scalar holding 4 bytes: it is duplicated into a 64-bit
// vector and two pairwise maximums leave the overall maximum in lane 0.
3834 EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(
const Packet4c& a) {
3835 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3836 int8x8_t max = vpmax_s8(a_dup, a_dup);
3837 max = vpmax_s8(max, max);
3838 return vget_lane_s8(max, 0);
// Under EIGEN_ARCH_ARM64 the 16-lane overload uses vmaxvq_s8.
// NOTE(review): the body of the Packet8c overload in this branch is not
// visible in this excerpt.
3840 #if EIGEN_ARCH_ARM64 3842 EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(
const Packet8c& a) {
3846 EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(
const Packet16c& a) {
3847 return vmaxvq_s8(a);
// Pairwise versions (presumably the non-ARM64 branch): three pairwise
// maximums reduce 8 lanes to one.
3851 EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(
const Packet8c& a) {
3852 int8x8_t max = vpmax_s8(a, a);
3853 max = vpmax_s8(max, max);
3854 max = vpmax_s8(max, max);
3855 return vget_lane_s8(max, 0);
// 16 lanes: element-wise max of the two halves, then reduce as for 8 lanes.
3858 EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(
const Packet16c& a) {
3859 int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
3860 max = vpmax_s8(max, max);
3861 max = vpmax_s8(max, max);
3862 max = vpmax_s8(max, max);
3863 return vget_lane_s8(max, 0);
// predux_max (uint8 scalar-wrapped packet): duplicate the 32-bit value into a
// 64-bit vector; two pairwise maximums leave the maximum of its 4 bytes in
// lane 0.
3867 EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(
const Packet4uc& a) {
3868 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3869 uint8x8_t max = vpmax_u8(a_dup, a_dup);
3870 max = vpmax_u8(max, max);
3871 return vget_lane_u8(max, 0);
// EIGEN_ARCH_ARM64 branch: the remaining 8-, 16- and 32-bit integer packets use
// the matching across-vector maximum intrinsic (vmaxv_* / vmaxvq_*).
// NOTE(review): the body of the Packet8uc overload in this branch is not
// visible in this excerpt.
3873 #if EIGEN_ARCH_ARM64 3875 EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(
const Packet8uc& a) {
3879 EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(
const Packet16uc& a) {
3880 return vmaxvq_u8(a);
3883 EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(
const Packet4s& a) {
3884 return vmaxv_s16(a);
3887 EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(
const Packet8s& a) {
3888 return vmaxvq_s16(a);
3891 EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(
const Packet4us& a) {
3892 return vmaxv_u16(a);
3895 EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(
const Packet8us& a) {
3896 return vmaxvq_u16(a);
3899 EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(
const Packet2i& a) {
3900 return vmaxv_s32(a);
3903 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
3904 return vmaxvq_s32(a);
3907 EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(
const Packet2ui& a) {
3908 return vmaxv_u32(a);
3911 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(
const Packet4ui& a) {
3912 return vmaxvq_u32(a);
// predux_max, pairwise implementations for the 8-, 16- and 32-bit integer
// packets (presumably the non-ARM64 branch; the preprocessor lines are not
// visible in this excerpt).
// Scheme: for 128-bit vectors, take the element-wise max of the low and high
// halves; then apply vpmax_* log2(lane count) times so the overall maximum
// ends up in lane 0.
3916 EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(
const Packet8uc& a) {
3917 uint8x8_t max = vpmax_u8(a, a);
3918 max = vpmax_u8(max, max);
3919 max = vpmax_u8(max, max);
3920 return vget_lane_u8(max, 0);
3923 EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(
const Packet16uc& a) {
3924 uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
3925 max = vpmax_u8(max, max);
3926 max = vpmax_u8(max, max);
3927 max = vpmax_u8(max, max);
3928 return vget_lane_u8(max, 0);
3931 EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(
const Packet4s& a) {
3932 const int16x4_t max = vpmax_s16(a, a);
3933 return vget_lane_s16(vpmax_s16(max, max), 0);
3936 EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(
const Packet8s& a) {
3937 int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
3938 max = vpmax_s16(max, max);
3939 max = vpmax_s16(max, max);
3940 return vget_lane_s16(max, 0);
3943 EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(
const Packet4us& a) {
3944 const uint16x4_t max = vpmax_u16(a, a);
3945 return vget_lane_u16(vpmax_u16(max, max), 0);
3948 EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(
const Packet8us& a) {
3949 uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
3950 max = vpmax_u16(max, max);
3951 max = vpmax_u16(max, max);
3952 return vget_lane_u16(max, 0);
3955 EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(
const Packet2i& a) {
3956 return vget_lane_s32(vpmax_s32(a, a), 0);
3959 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
3960 const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
3961 return vget_lane_s32(vpmax_s32(max, max), 0);
3964 EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(
const Packet2ui& a) {
3965 return vget_lane_u32(vpmax_u32(a, a), 0);
3968 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(
const Packet4ui& a) {
3969 const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
3970 return vget_lane_u32(vpmax_u32(max, max), 0);
// predux_max for the 64-bit packets: both lanes are extracted and compared as
// scalars. std::max is parenthesised so a function-like `max` macro, if one is
// defined, is not expanded here.
3974 EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(
const Packet2l& a) {
3975 return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3978 EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(
const Packet2ul& a) {
3979 return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
// predux_any: true if any bit is set in any lane of x.
// The float bits are viewed as four uint32 lanes; the low and high halves are
// OR-ed together, and a pairwise unsigned max of the two remaining lanes is
// non-zero exactly when at least one input lane was non-zero. The uint32
// result is converted to bool by the return.
3983 EIGEN_STRONG_INLINE
bool predux_any(
const Packet4f& x) {
3984 uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
3985 return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
// zip_in_place<Packet>(p1, p2): primary template declaration followed by one
// specialization per NEON vector type. Each specialization interleaves the
// lanes of p1 and p2 with the matching vzip_* / vzipq_* intrinsic, which
// returns a two-vector struct.
// NOTE(review): the statements that store the two result vectors back into p1
// and p2 are not visible in this excerpt; given the name and the non-const
// reference parameters, presumably tmp.val[0] / tmp.val[1] are written back.
3991 template <
typename Packet>
3992 void zip_in_place(Packet& p1, Packet& p2);
3995 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
3996 const float32x2x2_t tmp = vzip_f32(p1, p2);
4002 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
4003 const float32x4x2_t tmp = vzipq_f32(p1, p2);
4009 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
4010 const int8x8x2_t tmp = vzip_s8(p1, p2);
4016 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
4017 const int8x16x2_t tmp = vzipq_s8(p1, p2);
4023 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
4024 const uint8x8x2_t tmp = vzip_u8(p1, p2);
4030 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
4031 const uint8x16x2_t tmp = vzipq_u8(p1, p2);
4037 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
4038 const int32x2x2_t tmp = vzip_s32(p1, p2);
4044 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
4045 const int32x4x2_t tmp = vzipq_s32(p1, p2);
4051 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
4052 const uint32x2x2_t tmp = vzip_u32(p1, p2);
4058 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
4059 const uint32x4x2_t tmp = vzipq_u32(p1, p2);
4065 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
4066 const int16x4x2_t tmp = vzip_s16(p1, p2);
4072 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
4073 const int16x8x2_t tmp = vzipq_s16(p1, p2);
4079 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
4080 const uint16x4x2_t tmp = vzip_u16(p1, p2);
4086 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
4087 const uint16x8x2_t tmp = vzipq_u16(p1, p2);
// ptranspose_impl for a block of 2 packets: a single zip of packet 0 with
// packet 1, performed in place on the block.
4092 template <
typename Packet>
4093 EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
4094 zip_in_place(kernel.packet[0], kernel.packet[1]);
// ptranspose_impl for a block of 4 packets, done in place as two rounds of
// zips: first packets two apart (0,2) and (1,3), then adjacent packets (0,1)
// and (2,3). The order of the rounds matters because each zip overwrites its
// operands.
4097 template <
typename Packet>
4098 EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
4099 zip_in_place(kernel.packet[0], kernel.packet[2]);
4100 zip_in_place(kernel.packet[1], kernel.packet[3]);
4101 zip_in_place(kernel.packet[0], kernel.packet[1]);
4102 zip_in_place(kernel.packet[2], kernel.packet[3]);
// ptranspose_impl for a block of 8 packets, done in place as three rounds of
// four zips each. The distance between the zipped packets halves every round:
// 4, then 2, then 1.
4105 template <
typename Packet>
4106 EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
// Round 1: partners are 4 apart.
4107 zip_in_place(kernel.packet[0], kernel.packet[4]);
4108 zip_in_place(kernel.packet[1], kernel.packet[5]);
4109 zip_in_place(kernel.packet[2], kernel.packet[6]);
4110 zip_in_place(kernel.packet[3], kernel.packet[7]);
// Round 2: partners are 2 apart.
4112 zip_in_place(kernel.packet[0], kernel.packet[2]);
4113 zip_in_place(kernel.packet[1], kernel.packet[3]);
4114 zip_in_place(kernel.packet[4], kernel.packet[6]);
4115 zip_in_place(kernel.packet[5], kernel.packet[7]);
// Round 3: adjacent packets.
4117 zip_in_place(kernel.packet[0], kernel.packet[1]);
4118 zip_in_place(kernel.packet[2], kernel.packet[3]);
4119 zip_in_place(kernel.packet[4], kernel.packet[5]);
4120 zip_in_place(kernel.packet[6], kernel.packet[7]);
// ptranspose_impl for a block of 16 packets: the same scheme as the 4- and
// 8-packet overloads, written as loops instead of being spelled out.
//   i : round index, 0..3
//   n : distance between zipped partners in round i = 8, 4, 2, 1
//   m : number of groups of 2*n consecutive packets in round i = 1, 2, 4, 8
//   j : group index; k : offset inside the first half of the group
// Each round therefore performs m * n = 8 zips of packet[idx] with
// packet[idx + n].
4123 template <
typename Packet>
4124 EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
4126 for (
int i = 0; i < 4; ++i) {
4127 const int m = (1 << i);
4129 for (
int j = 0; j < m; ++j) {
4130 const int n = (1 << (3 - i));
4132 for (
int k = 0; k < n; ++k) {
// First packet of the pair: group start (2 * j * n) plus offset k.
4133 const int idx = 2 * j * n + k;
4134 zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
// ptranspose for the float packet blocks (2 x Packet2f and 4 x Packet4f):
// both forward to the generic zip-based detail::ptranspose_impl, which
// modifies the block in place.
4142 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
4143 detail::ptranspose_impl(kernel);
4145 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
4146 detail::ptranspose_impl(kernel);
// ptranspose for 4 x Packet4c. Packet4c is a 32-bit scalar, so the four
// packets are first packed into two 64-bit byte vectors:
//   a = { packet[0], packet[2] }   (32-bit lanes 0 and 1)
//   b = { packet[1], packet[3] }
// The bytes of a and b are interleaved (vzip_s8), the two results are then
// interleaved as 16-bit units (vzip_s16), and the four 32-bit lanes of that
// result are written back as the new packets 0..3.
4149 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
4150 const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
4151 const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
4153 const int8x8x2_t zip8 = vzip_s8(a, b);
4154 const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
4156 kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
4157 kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
4158 kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
4159 kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
// ptranspose for the signed 8-bit vector packets. Overloads exist for blocks
// of 8 or 4 Packet8c and of 16, 8 or 4 Packet16c; each forwards to
// detail::ptranspose_impl, which picks the overload matching the block size.
4161 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
4162 detail::ptranspose_impl(kernel);
4164 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
4165 detail::ptranspose_impl(kernel);
4167 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
4168 detail::ptranspose_impl(kernel);
4170 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
4171 detail::ptranspose_impl(kernel);
4173 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
4174 detail::ptranspose_impl(kernel);
// ptranspose for 4 x Packet4uc. Packet4uc is a 32-bit scalar, so the four
// packets are first packed into two 64-bit byte vectors:
//   a = { packet[0], packet[2] }   (32-bit lanes 0 and 1)
//   b = { packet[1], packet[3] }
// The bytes of a and b are interleaved (vzip_u8), the two results are then
// interleaved as 16-bit units (vzip_u16), and the four 32-bit lanes of that
// result are written back as the new packets 0..3.
4177 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
4178 const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
4179 const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
4181 const uint8x8x2_t zip8 = vzip_u8(a, b);
4182 const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
4184 kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
4185 kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
4186 kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
4187 kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
// ptranspose for the unsigned 8-bit vector packets. Overloads exist for blocks
// of 8 or 4 Packet8uc and of 16, 8 or 4 Packet16uc; each forwards to
// detail::ptranspose_impl, which picks the overload matching the block size.
4189 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
4190 detail::ptranspose_impl(kernel);
4192 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
4193 detail::ptranspose_impl(kernel);
4195 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
4196 detail::ptranspose_impl(kernel);
4198 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
4199 detail::ptranspose_impl(kernel);
4201 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
4202 detail::ptranspose_impl(kernel);
// Transpose overloads for blocks of 16-bit integer packets, signed
// (Packet4s, Packet8s) and unsigned (Packet4us, Packet8us). Each forwards the
// block to detail::ptranspose_impl for an in-place transpose.
4205 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
4206 detail::ptranspose_impl(kernel);
4208 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
4209 detail::ptranspose_impl(kernel);
4211 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
4212 detail::ptranspose_impl(kernel);
4215 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
4216 detail::ptranspose_impl(kernel);
4218 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
4219 detail::ptranspose_impl(kernel);
4221 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
4222 detail::ptranspose_impl(kernel);
// Transpose overloads for blocks of 32-bit integer packets. The Packet2i,
// Packet4i and Packet4ui overloads delegate to detail::ptranspose_impl; the
// Packet2ui overload calls detail::zip_in_place on its two packets directly.
// NOTE(review): presumably both routes give the same 2x2 result; confirm
// against the definition of ptranspose_impl.
4225 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
4226 detail::ptranspose_impl(kernel);
4228 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
4229 detail::ptranspose_impl(kernel);
4231 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
4232 detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
4234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
4235 detail::ptranspose_impl(kernel);
// Transposes a 2x2 block of signed 64-bit lanes in place.
4238 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
// ARM64 path: zip1 collects lane 0 of both packets, zip2 collects lane 1.
// tmp1 is held back because packet[0] is still an input of the zip2 call.
4239 #if EIGEN_ARCH_ARM64 4240 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
4241 kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
4242 kernel.packet[0] = tmp1;
// Alternative path (presumably the non-ARM64 branch): split both packets into
// low/high halves, then pair the two lows and the two highs.
4244 const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
4245 {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
4247 kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
4248 kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
// Transposes a 2x2 block of unsigned 64-bit lanes in place.
4251 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
// ARM64 path: zip1 collects lane 0 of both packets, zip2 collects lane 1.
// tmp1 is held back because packet[0] is still an input of the zip2 call.
4252 #if EIGEN_ARCH_ARM64 4253 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
4254 kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
4255 kernel.packet[0] = tmp1;
// Alternative path (presumably the non-ARM64 branch): split both packets into
// low/high halves, then pair the two lows and the two highs.
4257 const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
4258 {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
4260 kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
4261 kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
// pselect for float packets: NEON bitwise select (vbsl) takes bits from `a`
// where the mask bit is 1 and from `b` where it is 0. The float mask is
// reinterpreted as 32-bit unsigned lanes because vbsl expects an unsigned mask.
4266 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(
const Packet2f& mask,
const Packet2f& a,
const Packet2f& b) {
4267 return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
4270 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
4271 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
// pselect for 8-bit integer packets, implemented with NEON bitwise select
// (vbsl): mask bits set to 1 pick `a`, bits set to 0 pick `b`. Signed masks
// are reinterpreted as unsigned first; unsigned masks are passed as-is.
4274 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(
const Packet8c& mask,
const Packet8c& a,
const Packet8c& b) {
4275 return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
4278 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(
const Packet16c& mask,
const Packet16c& a,
const Packet16c& b) {
4279 return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
4282 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(
const Packet8uc& mask,
const Packet8uc& a,
const Packet8uc& b) {
4283 return vbsl_u8(mask, a, b);
4286 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(
const Packet16uc& mask,
const Packet16uc& a,
4287 const Packet16uc& b) {
4288 return vbslq_u8(mask, a, b);
// pselect for 16-bit integer packets, implemented with NEON bitwise select
// (vbsl): mask bits set to 1 pick `a`, bits set to 0 pick `b`. Signed masks
// are reinterpreted as unsigned first; unsigned masks are passed as-is.
4291 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(
const Packet4s& mask,
const Packet4s& a,
const Packet4s& b) {
4292 return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
4295 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(
const Packet8s& mask,
const Packet8s& a,
const Packet8s& b) {
4296 return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
4299 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(
const Packet4us& mask,
const Packet4us& a,
const Packet4us& b) {
4300 return vbsl_u16(mask, a, b);
4303 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(
const Packet8us& mask,
const Packet8us& a,
const Packet8us& b) {
4304 return vbslq_u16(mask, a, b);
// pselect for 32-bit integer packets, implemented with NEON bitwise select
// (vbsl): mask bits set to 1 pick `a`, bits set to 0 pick `b`. Signed masks
// are reinterpreted as unsigned first; unsigned masks are passed as-is.
4307 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(
const Packet2i& mask,
const Packet2i& a,
const Packet2i& b) {
4308 return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
4311 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(
const Packet4i& mask,
const Packet4i& a,
const Packet4i& b) {
4312 return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
4315 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(
const Packet2ui& mask,
const Packet2ui& a,
const Packet2ui& b) {
4316 return vbsl_u32(mask, a, b);
4319 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(
const Packet4ui& mask,
const Packet4ui& a,
const Packet4ui& b) {
4320 return vbslq_u32(mask, a, b);
// pselect for 64-bit integer packets, implemented with NEON bitwise select
// (vbslq): mask bits set to 1 pick `a`, bits set to 0 pick `b`. The signed
// mask is reinterpreted as unsigned; the unsigned mask is passed as-is.
4323 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(
const Packet2l& mask,
const Packet2l& a,
const Packet2l& b) {
4324 return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
4327 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(
const Packet2ul& mask,
const Packet2ul& a,
const Packet2ul& b) {
4328 return vbslq_u64(mask, a, b);
// print for float packets, guarded by EIGEN_ARCH_ARMV8: maps to the
// vrndn intrinsics (round to nearest, ties to even).
4332 #if EIGEN_ARCH_ARMV8 4334 EIGEN_STRONG_INLINE Packet2f print<Packet2f>(
const Packet2f& a) {
4335 return vrndn_f32(a);
4339 EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a) {
4340 return vrndnq_f32(a);
// pfloor for float packets: maps to the vrndm intrinsics (round toward
// negative infinity).
4344 EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(
const Packet2f& a) {
4345 return vrndm_f32(a);
4349 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
4350 return vrndmq_f32(a);
// pceil for float packets: maps to the vrndp intrinsics (round toward
// positive infinity).
4354 EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(
const Packet2f& a) {
4355 return vrndp_f32(a);
4359 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
4360 return vrndpq_f32(a);
// pround for float packets: maps to the vrnda intrinsics (round to nearest,
// ties away from zero).
4364 EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(
const Packet2f& a) {
4365 return vrnda_f32(a);
4369 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
4370 return vrndaq_f32(a);
// ptrunc for float packets. The Packet4f specialization maps to vrndq_f32
// (round toward zero). The body of the Packet2f specialization is not visible
// in this extract.
4374 EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(
const Packet2f& a) {
4379 EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(
const Packet4f& a) {
4380 return vrndq_f32(a);
// Integer square root of four unsigned bytes packed in a uint32_t, computed
// bit by bit from the highest result bit downwards.
4391 EIGEN_STRONG_INLINE Packet4uc psqrt(
const Packet4uc& a) {
// Broadcast the packed word so its four bytes can be handled as NEON lanes.
4392 uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
4393 uint8x8_t res = vdup_n_u8(0);
// The root of an 8-bit value fits in 4 bits, so start at bit 3 (0x8).
4394 uint8x8_t add = vdup_n_u8(0x8);
// Each pass tentatively sets the current bit and keeps it only in lanes where
// the squared candidate does not exceed the input. Candidates are at most 15,
// so the 8-bit product cannot wrap.
4395 for (
int i = 0; i < 4; i++) {
4396 const uint8x8_t temp = vorr_u8(res, add);
4397 res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
4398 add = vshr_n_u8(add, 1);
// Only the low 32 bits (the first four byte lanes) are returned.
4400 return vget_lane_u32(vreinterpret_u32_u8(res), 0);
// Integer square root for unsigned 8-bit packets (8 and 16 lanes), computed
// bit by bit. `res` accumulates the root and `add` is the bit under test; it
// starts at 0x8 because the root of an 8-bit value fits in 4 bits. Each of the
// 4 passes keeps the candidate res|add only in lanes where its square does
// not exceed the input. Candidates are at most 15, so the 8-bit product
// cannot wrap.
4404 EIGEN_STRONG_INLINE Packet8uc psqrt(
const Packet8uc& a) {
4405 uint8x8_t res = vdup_n_u8(0);
4406 uint8x8_t add = vdup_n_u8(0x8);
4407 for (
int i = 0; i < 4; i++) {
4408 const uint8x8_t temp = vorr_u8(res, add);
4409 res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
4410 add = vshr_n_u8(add, 1);
4416 EIGEN_STRONG_INLINE Packet16uc psqrt(
const Packet16uc& a) {
4417 uint8x16_t res = vdupq_n_u8(0);
4418 uint8x16_t add = vdupq_n_u8(0x8);
4419 for (
int i = 0; i < 4; i++) {
4420 const uint8x16_t temp = vorrq_u8(res, add);
4421 res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
4422 add = vshrq_n_u8(add, 1);
// Integer square root for unsigned 16-bit packets (4 and 8 lanes), computed
// bit by bit. The root of a 16-bit value fits in 8 bits, so the tested bit
// `add` starts at 0x80 and the loop runs 8 passes. A candidate res|add is kept
// only in lanes where its square does not exceed the input. Candidates are at
// most 255, so the 16-bit product cannot wrap.
4428 EIGEN_STRONG_INLINE Packet4us psqrt(
const Packet4us& a) {
4429 uint16x4_t res = vdup_n_u16(0);
4430 uint16x4_t add = vdup_n_u16(0x80);
4431 for (
int i = 0; i < 8; i++) {
4432 const uint16x4_t temp = vorr_u16(res, add);
4433 res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
4434 add = vshr_n_u16(add, 1);
4440 EIGEN_STRONG_INLINE Packet8us psqrt(
const Packet8us& a) {
4441 uint16x8_t res = vdupq_n_u16(0);
4442 uint16x8_t add = vdupq_n_u16(0x80);
4443 for (
int i = 0; i < 8; i++) {
4444 const uint16x8_t temp = vorrq_u16(res, add);
4445 res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
4446 add = vshrq_n_u16(add, 1);
// Integer square root for unsigned 32-bit packets (2 and 4 lanes), computed
// bit by bit. The root of a 32-bit value fits in 16 bits, so the tested bit
// `add` starts at 0x8000 and the loop runs 16 passes. A candidate res|add is
// kept only in lanes where its square does not exceed the input. Candidates
// are at most 65535, so the 32-bit product cannot wrap.
4452 EIGEN_STRONG_INLINE Packet2ui psqrt(
const Packet2ui& a) {
4453 uint32x2_t res = vdup_n_u32(0);
4454 uint32x2_t add = vdup_n_u32(0x8000);
4455 for (
int i = 0; i < 16; i++) {
4456 const uint32x2_t temp = vorr_u32(res, add);
4457 res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
4458 add = vshr_n_u32(add, 1);
4464 EIGEN_STRONG_INLINE Packet4ui psqrt(
const Packet4ui& a) {
4465 uint32x4_t res = vdupq_n_u32(0);
4466 uint32x4_t add = vdupq_n_u32(0x8000);
4467 for (
int i = 0; i < 16; i++) {
4468 const uint32x4_t temp = vorrq_u32(res, add);
4469 res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
4470 add = vshrq_n_u32(add, 1);
// Raw reciprocal square root for float packets: a hardware estimate
// (vrsqrte) refined by two Newton-Raphson steps, each scaling the estimate by
// the vrsqrts correction factor. No special-casing of inputs happens here;
// callers in this file wrap it with explicit zero/infinity handling.
4475 EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(
const Packet4f& a) {
4478 float32x4_t result = vrsqrteq_f32(a);
4479 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4480 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
// 64-bit (two-lane) variant of the same estimate-and-refine sequence.
4484 EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(
const Packet2f& a) {
4487 float32x2_t result = vrsqrte_f32(a);
4488 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4489 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4493 template <
typename Packet>
4494 Packet prsqrt_float_common(
const Packet& a) {
4495 const Packet cst_zero = pzero(a);
4496 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4497 Packet return_zero = pcmp_eq(a, cst_inf);
4498 Packet return_inf = pcmp_eq(a, cst_zero);
4499 Packet result = prsqrt_float_unsafe(a);
4500 result = pselect(return_inf, por(cst_inf, a), result);
4501 result = pandnot(result, return_zero);
// prsqrt for float packets: both widths delegate to prsqrt_float_common.
4506 EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
4507 return prsqrt_float_common(a);
4511 EIGEN_STRONG_INLINE Packet2f prsqrt(
const Packet2f& a) {
4512 return prsqrt_float_common(a);
// Reciprocal for float packets: a hardware estimate (vrecpe) refined by two
// Newton-Raphson steps, each multiplying the estimate by the vrecps
// correction factor.
4516 EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(
const Packet4f& a) {
4518 float32x4_t result = vrecpeq_f32(a);
4519 result = vmulq_f32(vrecpsq_f32(a, result), result);
4520 result = vmulq_f32(vrecpsq_f32(a, result), result);
// Two-lane variant of the same estimate-and-refine sequence.
4525 EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(
const Packet2f& a) {
4527 float32x2_t result = vrecpe_f32(a);
4528 result = vmul_f32(vrecps_f32(a, result), result);
4529 result = vmul_f32(vrecps_f32(a, result), result);
// ARM64-only float psqrt and pdiv: these map directly to the native
// square-root (vsqrt) and division (vdiv) intrinsics.
4534 #if EIGEN_ARCH_ARM64 4536 EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
4537 return vsqrtq_f32(a);
4541 EIGEN_STRONG_INLINE Packet2f psqrt(
const Packet2f& a) {
4542 return vsqrt_f32(a);
4546 EIGEN_STRONG_INLINE Packet4f pdiv(
const Packet4f& a,
const Packet4f& b) {
4547 return vdivq_f32(a, b);
4551 EIGEN_STRONG_INLINE Packet2f pdiv(
const Packet2f& a,
const Packet2f& b) {
4552 return vdiv_f32(a, b);
// Square root built from the reciprocal square root: sqrt(a) = a * rsqrt(a).
// Lanes where a is zero or +infinity return a itself instead of the product.
4555 template <
typename Packet>
4556 EIGEN_STRONG_INLINE Packet psqrt_float_common(
const Packet& a) {
4557 const Packet cst_zero = pzero(a);
4558 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4560 Packet result = pmul(a, prsqrt_float_unsafe(a));
// Build the mask of lanes that must pass the input through unchanged.
4561 Packet a_is_zero = pcmp_eq(a, cst_zero);
4562 Packet a_is_inf = pcmp_eq(a, cst_inf);
4563 Packet return_a = por(a_is_zero, a_is_inf);
4565 result = pselect(return_a, a, result);
// Float psqrt variants that delegate to psqrt_float_common (presumably the
// branch used when the native ARM64 square root is unavailable).
4570 EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
4571 return psqrt_float_common(a);
4575 EIGEN_STRONG_INLINE Packet2f psqrt(
const Packet2f& a) {
4576 return psqrt_float_common(a);
// Division implemented as a * reciprocal(b), with a rescaling factor f:
//   a / b = f * (a * reciprocal(b * f))
// f is 0.25 in lanes where |b| >= highest()/4 and 1 elsewhere, so very large
// divisors are scaled down before the reciprocal is taken (the mask name
// suggests this guards the reciprocal against underflow).
4579 template <
typename Packet>
4580 EIGEN_STRONG_INLINE Packet pdiv_float_common(
const Packet& a,
const Packet& b) {
4585 const Packet cst_one = pset1<Packet>(1.0f);
4586 const Packet cst_quarter = pset1<Packet>(0.25f);
4587 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
// Lanes where cst_thresh <= |b|.
4589 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
4590 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
4591 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
// Float pdiv specializations that delegate to pdiv_float_common (presumably
// the branch used when the native ARM64 division is unavailable).
4596 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
4597 return pdiv_float_common(a, b);
4601 EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
4602 return pdiv_float_common(a, b);
// Packet of four bfloat16 values, stored as four 16-bit unsigned lanes. The
// wrapper (tag 19) gives it a type distinct from the plain uint16x4_t packets.
4610 typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
// Mark Packet4bf as an arithmetic type.
4613 struct is_arithmetic<Packet4bf> {
4614 enum { value =
true };
// Packet traits for bfloat16: the full and the half packet are both Packet4bf.
// Only part of the feature-flag list is visible in this extract; the flags
// shown here are enabled when EIGEN_FAST_MATH is set.
4618 struct packet_traits<bfloat16> : default_packet_traits {
4619 typedef Packet4bf type;
4620 typedef Packet4bf half;
4623 AlignedOnScalar = 1,
4642 HasSin = EIGEN_FAST_MATH,
4643 HasCos = EIGEN_FAST_MATH,
4647 HasTanh = EIGEN_FAST_MATH,
4648 HasErf = EIGEN_FAST_MATH,
// Unpacket traits come from the shared NEON defaults for (Packet4bf, bfloat16).
4655 struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
// zip_in_place for bfloat16 packets: interleaves the 16-bit lanes of p1 and p2
// with vzip_u16. The statements that write the result back into p1 and p2 are
// not visible in this extract.
4659 EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
4660 const uint16x4x2_t tmp = vzip_u16(p1, p2);
// Converts four floats to bfloat16 by keeping the upper 16 bits of each
// float, with round-to-nearest-even on the discarded lower half and a fixed
// quiet-NaN pattern for NaN inputs.
4666 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(
const Packet4f& p) {
// Work on the raw bit patterns.
4669 Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
// lsb = (input >> 16) & 1: the lowest bit that survives the truncation.
4672 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
// rounding_bias = 0x7fff + lsb, so exact ties round to an even result.
4675 Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
// Add the bias, then drop the low 16 bits.
4678 input = vaddq_u32(input, rounding_bias);
4681 input = vshrq_n_u32(input, 16);
// p == p is false only for NaN lanes; those lanes get 0x7fc0 instead.
4684 const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
4685 const Packet4ui mask = vceqq_f32(p, p);
4686 input = vbslq_u32(mask, input, bf16_nan);
// Narrow each 32-bit lane to 16 bits.
4689 return vmovn_u32(input);
// Converts four bfloat16 values to float: widen each 16-bit lane to 32 bits,
// shift it into the upper half, and reinterpret the bits as float.
4692 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(
const Packet4bf& p) {
4693 return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
// Converts a float comparison mask to a bfloat16 mask by narrowing each 32-bit
// lane to its low 16 bits. This assumes every mask lane is all-ones or
// all-zeros, in which case narrowing preserves the mask.
4696 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(
const Packet4f& p) {
return vmovn_u32(vreinterpretq_u32_f32(p)); }
// Broadcast, first-element, load and store primitives for Packet4bf. They
// operate on the raw 16-bit patterns: each one forwards to the matching
// Packet4us primitive (or to vst1_u16 for stores) after casting the bfloat16
// pointer/value to uint16_t.
4699 EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(
const bfloat16& from) {
4700 return Packet4bf(pset1<Packet4us>(from.value));
// Rebuilds a bfloat16 from the raw 16 bits of the first lane.
4704 EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(
const Packet4bf& from) {
4705 return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
// Aligned load: the pointer is passed through assume_aligned with the
// packet's alignment. The first line of the return expression is not visible
// in this extract.
4709 EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(
const bfloat16* from) {
4711 pload<Packet4us>(reinterpret_cast<const uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(from))));
// Unaligned load.
4715 EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(
const bfloat16* from) {
4716 return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
// Aligned store.
4720 EIGEN_STRONG_INLINE
void pstore<bfloat16>(bfloat16* to,
const Packet4bf& from) {
4721 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
4722 reinterpret_cast<uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(to)), from);
// Unaligned store.
4726 EIGEN_STRONG_INLINE
void pstoreu<bfloat16>(bfloat16* to,
const Packet4bf& from) {
4727 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
// Duplicating load, delegated to the Packet4us version.
4731 EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(
const bfloat16* from) {
4732 return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
// pabs, pmin, pmax and plset for Packet4bf. Each one widens its operands to
// float with Bf16ToF32, applies the Packet4f primitive with the same NaN
// propagation policy, and converts the result back with F32ToBf16.
4736 EIGEN_STRONG_INLINE Packet4bf pabs(
const Packet4bf& a) {
4737 return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
4741 EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4742 return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4745 EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4746 return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4750 EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4751 return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4755 EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4756 return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4759 EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4760 return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4764 EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4765 return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
// plset builds the float sequence from the scalar converted to float, then
// rounds the whole packet to bfloat16.
4769 EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(
const bfloat16& a) {
4770 return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
// Bitwise primitives and pselect for Packet4bf. No float conversion is
// needed: each one views its operands as Packet4us, applies the 16-bit integer
// primitive, and wraps the result as Packet4bf again.
4774 EIGEN_STRONG_INLINE Packet4bf por(
const Packet4bf& a,
const Packet4bf& b) {
4775 return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
4779 EIGEN_STRONG_INLINE Packet4bf pxor(
const Packet4bf& a,
const Packet4bf& b) {
4780 return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
4784 EIGEN_STRONG_INLINE Packet4bf pand(
const Packet4bf& a,
const Packet4bf& b) {
4785 return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
4789 EIGEN_STRONG_INLINE Packet4bf pandnot(
const Packet4bf& a,
const Packet4bf& b) {
4790 return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
4794 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(
const Packet4bf& mask,
const Packet4bf& a,
const Packet4bf& b) {
4795 return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
// Rounding primitives for Packet4bf: widen to float, apply the Packet4f
// rounding primitive of the same name, and convert back to bfloat16.
4799 EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(
const Packet4bf& a) {
4800 return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
4804 EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(
const Packet4bf& a) {
4805 return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
4809 EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(
const Packet4bf& a) {
4810 return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
4814 EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(
const Packet4bf& a) {
4815 return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));
4819 EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(
const Packet4bf& a) {
4820 return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));
// pconj for Packet4bf; its body is not visible in this extract.
4824 EIGEN_STRONG_INLINE Packet4bf pconj(
const Packet4bf& a) {
// Arithmetic primitives for Packet4bf (add, subtract, multiply, the four
// fused multiply-add variants, divide). Every operand is widened to float,
// the Packet4f primitive of the same name is applied, and the float result is
// converted back to bfloat16 once at the end.
4829 EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4830 return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4834 EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4835 return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4839 EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4840 return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4844 EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b,
const Packet4bf& c) {
4845 return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4849 EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b,
const Packet4bf& c) {
4850 return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4854 EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b,
const Packet4bf& c) {
4855 return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4859 EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b,
const Packet4bf& c) {
4860 return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4864 EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4865 return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
// Strided gather and scatter for Packet4bf, delegated to the uint16_t /
// Packet4us versions on the raw 16-bit storage.
4869 EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(
const bfloat16* from,
Index stride) {
4870 return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
4874 EIGEN_STRONG_INLINE
void pscatter<bfloat16, Packet4bf>(bfloat16* to,
const Packet4bf& from,
Index stride) {
4875 pscatter<uint16_t, Packet4us>(
reinterpret_cast<uint16_t*
>(to), Packet4us(from), stride);
// Horizontal reductions for Packet4bf (sum, max, min, product). The packet is
// widened to float, reduced with the Packet4f reduction of the same name, and
// the scalar float result is cast to bfloat16.
4879 EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(
const Packet4bf& a) {
4880 return static_cast<bfloat16
>(predux<Packet4f>(Bf16ToF32(a)));
4884 EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(
const Packet4bf& a) {
4885 return static_cast<bfloat16
>(predux_max<Packet4f>(Bf16ToF32(a)));
4889 EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(
const Packet4bf& a) {
4890 return static_cast<bfloat16
>(predux_min<Packet4f>(Bf16ToF32(a)));
4894 EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(
const Packet4bf& a) {
4895 return static_cast<bfloat16
>(predux_mul<Packet4f>(Bf16ToF32(a)));
// Lane reversal for Packet4bf, done on the raw 16-bit lanes via Packet4us.
4899 EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(
const Packet4bf& a) {
4900 return Packet4bf(preverse<Packet4us>(Packet4us(a)));
// 4x4 transpose of bfloat16 packets, delegated to detail::ptranspose_impl.
4903 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
4904 detail::ptranspose_impl(kernel);
// Absolute difference for Packet4bf: computed in float, rounded back.
4908 EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4909 return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
// Comparisons for Packet4bf: the operands are compared as floats and the
// resulting 32-bit lane masks are narrowed to 16-bit masks with
// F32MaskToBf16Mask (no rounding is involved for masks).
4913 EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4914 return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4918 EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4919 return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4923 EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4924 return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4928 EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
4929 return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
// Negation flips bit 15 of every 16-bit lane (XOR with 0x8000), which is the
// sign bit of the bfloat16 encoding; no float round-trip is needed.
4933 EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(
const Packet4bf& a) {
4934 return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
// EIGEN_APPLE_DOUBLE_NEON_BUG is true for Apple clang versions below 6010000
// and 0 otherwise; the double-precision section that follows is compiled only
// on ARM64 when that flag is not set.
// The two templates below reinterpret between uint64x2_t and float64x2_t with
// a plain vector cast.
// NOTE(review): presumably these stand in for intrinsics that some toolchain
// does not provide; confirm against the surrounding (not visible) comments.
4941 #if EIGEN_COMP_CLANGAPPLE 4945 #define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000) 4947 #define EIGEN_APPLE_DOUBLE_NEON_BUG 0 4950 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG 4958 template <
typename T>
4959 uint64x2_t vreinterpretq_u64_f64(T a) {
4960 return (uint64x2_t)a;
4963 template <
typename T>
4964 float64x2_t vreinterpretq_f64_u64(T a) {
4965 return (float64x2_t)a;
// Double-precision packet types. Under EIGEN_COMP_MSVC_STRICT the NEON vector
// types are wrapped in eigen_packet_wrapper with distinct tags (18 and 19);
// otherwise the raw NEON types are used directly.
4969 #if EIGEN_COMP_MSVC_STRICT 4970 typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
4971 typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
// Wrapper variant: builds {a, b} by loading from a temporary array, with `a`
// in lane 0 and `b` in lane 1.
4973 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
4974 double from[2] = {a, b};
4975 return vld1q_f64(from);
4979 typedef float64x2_t Packet2d;
4980 typedef float64x1_t Packet1d;
// Raw-type variant: builds {a, b} with a brace initializer.
4982 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
return Packet2d{a, b}; }
4988 EIGEN_STRONG_INLINE Packet2d shuffle(
const Packet2d& m,
const Packet2d& n,
int mask) {
4989 const double* a =
reinterpret_cast<const double*
>(&m);
4990 const double* b =
reinterpret_cast<const double*
>(&n);
4991 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
// Convenience wrappers around shuffle(a, b, mask).
// vec2d_swizzle2 forwards an arbitrary mask.
4995 EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(
const Packet2d& a,
const Packet2d& b,
int mask) {
4996 return shuffle(a, b, mask);
// Mask 0 selects lane 0 of both inputs: {a[0], b[0]}.
4998 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 0); }
// Mask 3 selects lane 1 of both inputs: {a[1], b[1]}.
4999 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 3); }
// vec2d_duplane(a, p) broadcasts lane p of a to both lanes.
// packet_traits<double>: the full and the half packet are both Packet2d. Only
// part of the feature-flag list is visible in this extract; the flags shown
// here are enabled when EIGEN_FAST_MATH is set.
5000 #define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p)) 5003 struct packet_traits<double> : default_packet_traits {
5004 typedef Packet2d type;
5005 typedef Packet2d half;
5008 AlignedOnScalar = 1,
5029 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG 5036 HasSin = EIGEN_FAST_MATH,
5037 HasCos = EIGEN_FAST_MATH,
5041 HasTanh = EIGEN_FAST_MATH,
5042 HasErf = EIGEN_FAST_MATH,
5043 HasErfc = EIGEN_FAST_MATH
// unpacket_traits<Packet2d>: NEON defaults plus Packet2l as the matching
// integer packet (same lane count and lane width).
5048 struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
5049 using integer_packet = Packet2l;
// Constant builders for Packet2d.
// pzero: both lanes 0.0 (the argument is unused).
5053 EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(
const Packet2d& ) {
5054 return vdupq_n_f64(0.0);
// pset1: broadcast one scalar to both lanes.
5058 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
5059 return vdupq_n_f64(from);
// plset: {a + 0.0, a + 1.0}.
5063 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
5064 const double c[] = {0.0, 1.0};
5065 return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
// Lane-wise addition and subtraction for Packet2d.
5069 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5070 return vaddq_f64(a, b);
5074 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5075 return vsubq_f64(a, b);
// Forward declaration: paddsub below uses pxor before it is defined.
5079 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d&,
const Packet2d&);
// paddsub: the mask carries only the sign bit, in lane 0, so XOR-ing it into
// b negates b's lane 0. The result is {a[0] - b[0], a[1] + b[1]}.
5081 EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5082 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
5083 return padd(a, pxor(mask, b));
// Lane-wise negation for Packet2d.
5087 EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
5088 return vnegq_f64(a);
// pconj for Packet2d; its body is not visible in this extract.
5092 EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// Lane-wise multiplication and division via the native intrinsics.
5097 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5098 return vmulq_f64(a, b);
5102 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5103 return vdivq_f64(a, b);
// Multiply-add family for Packet2d.
// With EIGEN_VECTORIZE_FMA: pmadd = c + a*b and pnmadd = c - a*b using the
// fused intrinsics (vfmaq / vfmsq).
5106 #ifdef EIGEN_VECTORIZE_FMA 5109 EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5110 return vfmaq_f64(c, a, b);
5113 EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5114 return vfmsq_f64(c, a, b);
// Second pair (presumably the branch without EIGEN_VECTORIZE_FMA): the same
// two operations through the multiply-accumulate intrinsics (vmlaq / vmlsq).
5118 EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5119 return vmlaq_f64(c, a, b);
5122 EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5123 return vmlsq_f64(c, a, b);
// pmsub = a*b - c, obtained as -(c - a*b).
5127 EIGEN_STRONG_INLINE Packet2d pmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5128 return pnegate(pnmadd(a, b, c));
// pnmsub = -(a*b) - c, obtained as -(c + a*b).
5131 EIGEN_STRONG_INLINE Packet2d pnmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
5132 return pnegate(pmadd(a, b, c));
// Lane-wise minimum and maximum for Packet2d.
// Default pmin maps to vminq_f64.
5135 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5136 return vminq_f64(a, b);
// When __ARM_FEATURE_NUMERIC_MAXMIN is defined, the PropagateNumbers variants
// use the "number" min/max intrinsics (vminnmq / vmaxnmq).
5139 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN 5143 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5144 return vminnmq_f64(a, b);
5147 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5148 return vmaxnmq_f64(a, b);
// The PropagateNaN variants reuse the default pmin/pmax.
5154 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5155 return pmin<Packet2d>(a, b);
// Default pmax maps to vmaxq_f64.
5159 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5160 return vmaxq_f64(a, b);
5164 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5165 return pmax<Packet2d>(a, b);
// Bitwise primitives for Packet2d. Each one reinterprets both operands as
// 64-bit unsigned lanes, applies the integer operation, and reinterprets the
// result back to double. pandnot computes a & ~b (vbicq clears in `a` the bits
// that are set in `b`).
5170 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5171 return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5175 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5176 return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5180 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5181 return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5185 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
5186 return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
// Comparisons for Packet2d. The NEON compare intrinsics return 64-bit unsigned
// lane masks, which are reinterpreted as double so the mask has the packet's
// own type.
5190 EIGEN_STRONG_INLINE Packet2d pcmp_le(
const Packet2d& a,
const Packet2d& b) {
5191 return vreinterpretq_f64_u64(vcleq_f64(a, b));
5195 EIGEN_STRONG_INLINE Packet2d pcmp_lt(
const Packet2d& a,
const Packet2d& b) {
5196 return vreinterpretq_f64_u64(vcltq_f64(a, b));
// "less than or NaN" is computed as NOT(a >= b): the >= compare is false for
// NaN operands, so its complement is true for them. The bitwise NOT is done on
// 32-bit lanes (vmvnq_u32), hence the extra reinterpret steps.
5200 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(
const Packet2d& a,
const Packet2d& b) {
5201 return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
5205 EIGEN_STRONG_INLINE Packet2d pcmp_eq(
const Packet2d& a,
const Packet2d& b) {
5206 return vreinterpretq_f64_u64(vceqq_f64(a, b));
// Loads and stores for Packet2d. The aligned variants pass the pointer through
// assume_aligned with the packet's alignment; the unaligned variants use the
// pointer as given. Both go through the same vld1q_f64 / vst1q_f64 intrinsics.
5210 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
5211 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(from));
5215 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
5216 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f64(from);
// ploaddup: loads one double and duplicates it into both lanes.
5220 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
5221 return vld1q_dup_f64(from);
5224 EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
5225 EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(to), from);
5229 EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
5230 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
// Strided gather: lane i is loaded from from[i * stride]. The packet starts
// zeroed and each lane is filled with a single-lane load.
5234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
5235 Packet2d res = pset1<Packet2d>(0.0);
5236 res = vld1q_lane_f64(from + 0 * stride, res, 0);
5237 res = vld1q_lane_f64(from + 1 * stride, res, 1);
// Strided scatter: lane i is stored to to[i * stride].
5242 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
5243 vst1q_lane_f64(to + stride * 0, from, 0);
5244 vst1q_lane_f64(to + stride * 1, from, 1);
// Prefetch hint for a double address, via EIGEN_ARM_PREFETCH.
5248 EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
5249 EIGEN_ARM_PREFETCH(addr);
// pfirst: value of lane 0.
5254 EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
5255 return vgetq_lane_f64(a, 0);
// preverse: swaps the two lanes by recombining the high half before the low.
5259 EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
5260 return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
// pabs: lane-wise absolute value.
5264 EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
5265 return vabsq_f64(a);
// psignbit: arithmetic right shift by 63 on the signed 64-bit view replicates
// the sign bit, giving an all-ones lane for negative-signed inputs and an
// all-zeros lane otherwise.
5269 EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
5270 return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
// Horizontal reductions for Packet2d.
// predux: sum of both lanes.
5274 EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
5275 return vaddvq_f64(a);
// predux_mul: product of the low and high lanes. Under EIGEN_COMP_CLANGAPPLE
// the product is written with the vector `*` operator and lane 0 is read by
// subscript; the other variant uses vmul_f64 and vget_lane_f64.
5280 #if EIGEN_COMP_CLANGAPPLE 5282 EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
5283 return (vget_low_f64(a) * vget_high_f64(a))[0];
5287 EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
5288 return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
// predux_min / predux_max: minimum / maximum across both lanes.
5294 EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
5295 return vminvq_f64(a);
5300 EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
5301 return vmaxvq_f64(a);
// 2x2 transpose of doubles: zip1 gathers lane 0 of both packets, zip2 gathers
// lane 1. Both results are computed before either packet is overwritten.
5304 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
5305 const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
5306 const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
5308 kernel.packet[0] = tmp1;
5309 kernel.packet[1] = tmp2;
// pselect for doubles: bitwise select with the mask viewed as 64-bit unsigned
// lanes; set mask bits pick `a`, clear bits pick `b`.
5313 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(
const Packet2d& mask,
const Packet2d& a,
const Packet2d& b) {
5314 return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
// Rounding primitives for Packet2d, each mapped to one vrnd* intrinsic:
//   print  -> vrndnq (nearest, ties to even)
//   pfloor -> vrndmq (toward negative infinity)
//   pceil  -> vrndpq (toward positive infinity)
//   pround -> vrndaq (nearest, ties away from zero)
//   ptrunc -> vrndq  (toward zero)
5318 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a) {
5319 return vrndnq_f64(a);
5323 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
5324 return vrndmq_f64(a);
5328 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
5329 return vrndpq_f64(a);
5333 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
5334 return vrndaq_f64(a);
5338 EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(
const Packet2d& a) {
5339 return vrndq_f64(a);
// pldexp / pfrexp for Packet2d delegate to the generic packet implementations.
5343 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
5344 return pldexp_generic(a, exponent);
5348 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
5349 return pfrexp_generic(a, exponent);
// pset1frombits: broadcasts a raw 64-bit pattern to both lanes and
// reinterprets it as double.
5353 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
5354 return vreinterpretq_f64_u64(vdupq_n_u64(from));
// prsqrt for Packet2d: starts from the hardware estimate vrsqrteq_f64 and
// hands it to generic_rsqrt_newton_step<Packet2d, 3>.
// NOTE(review): the template argument 3 is presumably the number of Newton
// refinement steps; confirm against generic_rsqrt_newton_step.
5358 EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
5360 return generic_rsqrt_newton_step<Packet2d, 3>::run(a, vrsqrteq_f64(a));
// psqrt for Packet2d: native square-root intrinsic.
5364 EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& _x) {
5365 return vsqrtq_f64(_x);
5368 #endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG 5371 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 5372 typedef float16x4_t Packet4hf;
5373 typedef float16x8_t Packet8hf;
// Packet traits for Eigen::half: the full packet type is Packet8hf and the
// half-size packet type is Packet4hf.
// NOTE(review): only some of the trait entries (AlignedOnScalar, HasTanh,
// HasErf) are visible in this excerpt; the remaining entries and the closing
// of the struct are not shown here.
5376 struct packet_traits<
Eigen::half> : default_packet_traits {
5377 typedef Packet8hf type;
5378 typedef Packet4hf half;
5381 AlignedOnScalar = 1,
5407 HasTanh = packet_traits<float>::HasTanh,
5410 HasErf = EIGEN_FAST_MATH,
// Packet4hf takes all of its unpacket traits from neon_unpacket_default.
5417 struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
// Packet8hf inherits the neon_unpacket_default traits and names Packet4hf as
// its half-size packet.
5419 struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
5420 using half = Packet4hf;
5424 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(
const Packet8hf& a) {
5425 return vadd_f16(vget_low_f16(a), vget_high_f16(a));
5429 EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(
const Eigen::half& from) {
5430 return vdupq_n_f16(from.x);
5434 EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(
const Eigen::half& from) {
5435 return vdup_n_f16(from.x);
5439 EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(
const Eigen::half& a) {
5440 const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
5441 Packet8hf countdown = vld1q_f16(f);
5442 return vaddq_f16(pset1<Packet8hf>(a), countdown);
5446 EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(
const Eigen::half& a) {
5447 const float16_t f[] = {0, 1, 2, 3};
5448 Packet4hf countdown = vld1_f16(f);
5449 return vadd_f16(pset1<Packet4hf>(a), countdown);
5453 EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5454 return vaddq_f16(a, b);
5458 EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5459 return vadd_f16(a, b);
5463 EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5464 return vsubq_f16(a, b);
5468 EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5469 return vsub_f16(a, b);
5473 EIGEN_STRONG_INLINE Packet8hf pnegate(
const Packet8hf& a) {
5474 return vnegq_f16(a);
// pnegate overload for Packet4hf.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5478 EIGEN_STRONG_INLINE Packet4hf pnegate(
const Packet4hf& a) {
// pconj overload for Packet8hf.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5483 EIGEN_STRONG_INLINE Packet8hf pconj(
const Packet8hf& a) {
// pconj overload for Packet4hf.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5488 EIGEN_STRONG_INLINE Packet4hf pconj(
const Packet4hf& a) {
5493 EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5494 return vmulq_f16(a, b);
5498 EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5499 return vmul_f16(a, b);
5503 EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5504 return vdivq_f16(a, b);
5508 EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5509 return vdiv_f16(a, b);
5513 EIGEN_STRONG_INLINE Packet8hf pmadd(
const Packet8hf& a,
const Packet8hf& b,
const Packet8hf& c) {
5514 return vfmaq_f16(c, a, b);
5518 EIGEN_STRONG_INLINE Packet4hf pmadd(
const Packet4hf& a,
const Packet4hf& b,
const Packet4hf& c) {
5519 return vfma_f16(c, a, b);
5523 EIGEN_STRONG_INLINE Packet8hf pnmadd(
const Packet8hf& a,
const Packet8hf& b,
const Packet8hf& c) {
5524 return vfmsq_f16(c, a, b);
5528 EIGEN_STRONG_INLINE Packet4hf pnmadd(
const Packet4hf& a,
const Packet4hf& b,
const Packet4hf& c) {
5529 return vfms_f16(c, a, b);
5533 EIGEN_STRONG_INLINE Packet8hf pmsub(
const Packet8hf& a,
const Packet8hf& b,
const Packet8hf& c) {
5534 return pnegate(pnmadd(a, b, c));
5538 EIGEN_STRONG_INLINE Packet4hf pmsub(
const Packet4hf& a,
const Packet4hf& b,
const Packet4hf& c) {
5539 return pnegate(pnmadd(a, b, c));
5543 EIGEN_STRONG_INLINE Packet8hf pnmsub(
const Packet8hf& a,
const Packet8hf& b,
const Packet8hf& c) {
5544 return pnegate(pmadd(a, b, c));
5548 EIGEN_STRONG_INLINE Packet4hf pnmsub(
const Packet4hf& a,
const Packet4hf& b,
const Packet4hf& c) {
5549 return pnegate(pmadd(a, b, c));
5553 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5554 return vminq_f16(a, b);
5558 EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5559 return vmin_f16(a, b);
// pmin<PropagateNumbers> for Packet4hf, guarded by
// __ARM_FEATURE_NUMERIC_MAXMIN; forwards to the vminnm_f16 intrinsic.
5562 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN 5566 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5567 return vminnm_f16(a, b);
5570 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5571 return vminnmq_f16(a, b);
5576 EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5577 return pmin<Packet4hf>(a, b);
5581 EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5582 return pmin<Packet8hf>(a, b);
5586 EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5587 return vmaxq_f16(a, b);
5591 EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5592 return vmax_f16(a, b);
// pmax<PropagateNumbers> for Packet4hf, guarded by
// __ARM_FEATURE_NUMERIC_MAXMIN; forwards to the vmaxnm_f16 intrinsic.
5595 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN 5599 EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5600 return vmaxnm_f16(a, b);
5603 EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5604 return vmaxnmq_f16(a, b);
5609 EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5610 return pmax<Packet4hf>(a, b);
5614 EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5615 return pmax<Packet8hf>(a, b);
// Helper macros that stamp out pcmp_<name> for Packet8hf and Packet4hf. Each
// generated function calls the matching vc<name>[q]_f16 comparison intrinsic
// and reinterprets its u16 lane mask as an fp16 packet. They are instantiated
// below for eq, lt and le.
5618 #define EIGEN_MAKE_ARM_FP16_CMP_8(name) \ 5620 EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \ 5621 return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \ 5624 #define EIGEN_MAKE_ARM_FP16_CMP_4(name) \ 5626 EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \ 5627 return vreinterpret_f16_u16(vc##name##_f16(a, b)); \ 5630 EIGEN_MAKE_ARM_FP16_CMP_8(eq)
5631 EIGEN_MAKE_ARM_FP16_CMP_8(lt)
5632 EIGEN_MAKE_ARM_FP16_CMP_8(le)
5634 EIGEN_MAKE_ARM_FP16_CMP_4(eq)
5635 EIGEN_MAKE_ARM_FP16_CMP_4(lt)
5636 EIGEN_MAKE_ARM_FP16_CMP_4(le)
// The comparison helper macros are undefined here, then pcmp_lt_or_nan is
// specialized for Packet8hf: the mask is the bitwise complement of (a >= b),
// so a lane is set when a < b or when the comparison is unordered (NaN).
5638 #undef EIGEN_MAKE_ARM_FP16_CMP_8 5639 #undef EIGEN_MAKE_ARM_FP16_CMP_4 5642 EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5643 return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
5647 EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5648 return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
5652 EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(
const Packet8hf& a) {
5653 return vrndnq_f16(a);
5657 EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(
const Packet4hf& a) {
5658 return vrndn_f16(a);
5662 EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(
const Packet8hf& a) {
5663 return vrndmq_f16(a);
5667 EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(
const Packet4hf& a) {
5668 return vrndm_f16(a);
5672 EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(
const Packet8hf& a) {
5673 return vrndpq_f16(a);
5677 EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(
const Packet4hf& a) {
5678 return vrndp_f16(a);
5682 EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(
const Packet8hf& a) {
5683 return vrndaq_f16(a);
5687 EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(
const Packet4hf& a) {
5688 return vrnda_f16(a);
5692 EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(
const Packet8hf& a) {
5693 return vrndq_f16(a);
// ptrunc specialization for Packet4hf.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5697 EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(
const Packet4hf& a) {
5702 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(
const Packet8hf& a) {
5703 return vsqrtq_f16(a);
5707 EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(
const Packet4hf& a) {
5708 return vsqrt_f16(a);
5712 EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5713 return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5717 EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5718 return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5722 EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5723 return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5727 EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5728 return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5732 EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5733 return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5737 EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5738 return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5742 EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
5743 return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5747 EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
5748 return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5752 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(
const Eigen::half* from) {
5753 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f16(
5754 reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(from)));
5758 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(
const Eigen::half* from) {
5759 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_f16(
5760 reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(from)));
5764 EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(
const Eigen::half* from) {
5765 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5769 EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(
const Eigen::half* from) {
5770 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_f16(reinterpret_cast<const float16_t*>(from));
5774 EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(
const Eigen::half* from) {
5776 packet[0] = from[0].x;
5777 packet[1] = from[0].x;
5778 packet[2] = from[1].x;
5779 packet[3] = from[1].x;
5780 packet[4] = from[2].x;
5781 packet[5] = from[2].x;
5782 packet[6] = from[3].x;
5783 packet[7] = from[3].x;
// ploaddup specialization for Packet4hf.
// NOTE(review): only one statement of the body is visible in this excerpt (it
// takes a float16_t* view of a local named `packet`); the declarations, the
// lane writes and the return are not shown here -- confirm against the
// complete file.
5788 EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(
const Eigen::half* from) {
5791 tmp = (float16_t*)&packet;
5800 EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(
const Eigen::half* from) {
5802 lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
5803 hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
5804 return vcombine_f16(lo, hi);
5807 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(
const Packet8hf& a, Eigen::half b) {
5808 return vsetq_lane_f16(b.x, a, 0);
5811 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(
const Packet4hf& a, Eigen::half b) {
5812 return vset_lane_f16(b.x, a, 0);
5816 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(
const Packet8hf& mask,
const Packet8hf& a,
const Packet8hf& b) {
5817 return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
5821 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(
const Packet4hf& mask,
const Packet4hf& a,
const Packet4hf& b) {
5822 return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
5825 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(
const Packet8hf& a, Eigen::half b) {
5826 return vsetq_lane_f16(b.x, a, 7);
5829 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(
const Packet4hf& a, Eigen::half b) {
5830 return vset_lane_f16(b.x, a, 3);
5834 EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to,
const Packet8hf& from) {
5835 EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
5836 reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(to)), from);
5840 EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to,
const Packet4hf& from) {
5841 EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
5842 reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(to)), from);
5846 EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to,
const Packet8hf& from) {
5847 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5851 EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to,
const Packet4hf& from) {
5852 EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5856 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(
const Eigen::half* from,
Index stride) {
5857 Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
5858 res = vsetq_lane_f16(from[0 * stride].x, res, 0);
5859 res = vsetq_lane_f16(from[1 * stride].x, res, 1);
5860 res = vsetq_lane_f16(from[2 * stride].x, res, 2);
5861 res = vsetq_lane_f16(from[3 * stride].x, res, 3);
5862 res = vsetq_lane_f16(from[4 * stride].x, res, 4);
5863 res = vsetq_lane_f16(from[5 * stride].x, res, 5);
5864 res = vsetq_lane_f16(from[6 * stride].x, res, 6);
5865 res = vsetq_lane_f16(from[7 * stride].x, res, 7);
5870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(
const Eigen::half* from,
Index stride) {
5871 Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
5872 res = vset_lane_f16(from[0 * stride].x, res, 0);
5873 res = vset_lane_f16(from[1 * stride].x, res, 1);
5874 res = vset_lane_f16(from[2 * stride].x, res, 2);
5875 res = vset_lane_f16(from[3 * stride].x, res, 3);
5880 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet8hf>(Eigen::half* to,
const Packet8hf& from,
5882 to[stride * 0].x = vgetq_lane_f16(from, 0);
5883 to[stride * 1].x = vgetq_lane_f16(from, 1);
5884 to[stride * 2].x = vgetq_lane_f16(from, 2);
5885 to[stride * 3].x = vgetq_lane_f16(from, 3);
5886 to[stride * 4].x = vgetq_lane_f16(from, 4);
5887 to[stride * 5].x = vgetq_lane_f16(from, 5);
5888 to[stride * 6].x = vgetq_lane_f16(from, 6);
5889 to[stride * 7].x = vgetq_lane_f16(from, 7);
5893 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet4hf>(Eigen::half* to,
const Packet4hf& from,
5895 to[stride * 0].x = vget_lane_f16(from, 0);
5896 to[stride * 1].x = vget_lane_f16(from, 1);
5897 to[stride * 2].x = vget_lane_f16(from, 2);
5898 to[stride * 3].x = vget_lane_f16(from, 3);
5902 EIGEN_STRONG_INLINE
void prefetch<Eigen::half>(
const Eigen::half* addr) {
5903 EIGEN_ARM_PREFETCH(addr);
// pfirst specialization for Packet8hf, returning an Eigen::half.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5907 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(
const Packet8hf& a) {
// pfirst specialization for Packet4hf, returning an Eigen::half.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5916 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(
const Packet4hf& a) {
5925 EIGEN_STRONG_INLINE Packet8hf preverse(
const Packet8hf& a) {
5926 float16x4_t a_lo, a_hi;
5929 a_r64 = vrev64q_f16(a);
5930 a_lo = vget_low_f16(a_r64);
5931 a_hi = vget_high_f16(a_r64);
5932 return vcombine_f16(a_hi, a_lo);
5936 EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(
const Packet4hf& a) {
5937 return vrev64_f16(a);
5941 EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(
const Packet8hf& a) {
5942 return vabsq_f16(a);
5946 EIGEN_STRONG_INLINE Packet8hf psignbit(
const Packet8hf& a) {
5947 return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
// pabs specialization for Packet4hf.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
5951 EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(
const Packet4hf& a) {
5956 EIGEN_STRONG_INLINE Packet4hf psignbit(
const Packet4hf& a) {
5957 return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
5961 EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(
const Packet8hf& a) {
5962 float16x4_t a_lo, a_hi, sum;
5964 a_lo = vget_low_f16(a);
5965 a_hi = vget_high_f16(a);
5966 sum = vpadd_f16(a_lo, a_hi);
5967 sum = vpadd_f16(sum, sum);
5968 sum = vpadd_f16(sum, sum);
5971 h.x = vget_lane_f16(sum, 0);
5976 EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(
const Packet4hf& a) {
5979 sum = vpadd_f16(a, a);
5980 sum = vpadd_f16(sum, sum);
5982 h.x = vget_lane_f16(sum, 0);
5987 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(
const Packet8hf& a) {
5988 float16x4_t a_lo, a_hi, prod;
5990 a_lo = vget_low_f16(a);
5991 a_hi = vget_high_f16(a);
5992 prod = vmul_f16(a_lo, a_hi);
5993 prod = vmul_f16(prod, vrev64_f16(prod));
5996 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6001 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(
const Packet4hf& a) {
6003 prod = vmul_f16(a, vrev64_f16(a));
6005 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6010 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(
const Packet8hf& a) {
6012 h.x = vminvq_f16(a);
// predux_min specialization for Packet4hf, returning an Eigen::half.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
6017 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(
const Packet4hf& a) {
6024 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(
const Packet8hf& a) {
6026 h.x = vmaxvq_f16(a);
// predux_max specialization for Packet4hf, returning an Eigen::half.
// NOTE(review): only the signature is visible in this excerpt; the body is not
// shown here, so confirm the implementation against the complete file.
6031 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(
const Packet4hf& a) {
6037 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
6038 const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
6039 const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
6041 const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
6042 const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
6044 kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
6045 kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
6046 kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
6047 kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
6050 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
6051 EIGEN_ALIGN16 float16x4x4_t tmp_x4;
6052 float16_t* tmp = (float16_t*)&kernel;
6053 tmp_x4 = vld4_f16(tmp);
6055 kernel.packet[0] = tmp_x4.val[0];
6056 kernel.packet[1] = tmp_x4.val[1];
6057 kernel.packet[2] = tmp_x4.val[2];
6058 kernel.packet[3] = tmp_x4.val[3];
6061 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
6062 float16x8x2_t T_1[4];
6064 T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
6065 T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
6066 T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
6067 T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
6069 float16x8x2_t T_2[4];
6070 T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
6071 T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
6072 T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
6073 T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
6075 float16x8x2_t T_3[4];
6076 T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
6077 T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
6078 T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
6079 T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
6081 kernel.packet[0] = T_3[0].val[0];
6082 kernel.packet[1] = T_3[2].val[0];
6083 kernel.packet[2] = T_3[1].val[0];
6084 kernel.packet[3] = T_3[3].val[0];
6085 kernel.packet[4] = T_3[0].val[1];
6086 kernel.packet[5] = T_3[2].val[1];
6087 kernel.packet[6] = T_3[1].val[1];
6088 kernel.packet[7] = T_3[3].val[1];
6090 #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 6096 #endif // EIGEN_PACKET_MATH_NEON_H Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82