13 #ifndef EIGEN_PACKET_MATH_MSA_H 14 #define EIGEN_PACKET_MATH_MSA_H 20 #include "../../InternalHeaderCheck.h" 26 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 27 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 30 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 31 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 34 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 35 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 39 #define EIGEN_MSA_DEBUG \ 40 static bool firstTime = true; \ 43 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ 48 #define EIGEN_MSA_DEBUG 51 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) 53 typedef v4f32 Packet4f;
54 typedef v4i32 Packet4i;
55 typedef v4u32 Packet4ui;
57 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X} 58 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X} 59 #define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X} 61 inline std::ostream& operator<<(std::ostream& os,
const Packet4f& value) {
62 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
66 inline std::ostream& operator<<(std::ostream& os,
const Packet4i& value) {
67 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
71 inline std::ostream& operator<<(std::ostream& os,
const Packet4ui& value) {
72 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
77 struct packet_traits<float> : default_packet_traits {
78 typedef Packet4f type;
79 typedef Packet4f half;
86 HasSin = EIGEN_FAST_MATH,
87 HasCos = EIGEN_FAST_MATH,
88 HasTanh = EIGEN_FAST_MATH,
89 HasErf = EIGEN_FAST_MATH,
99 struct packet_traits<int32_t> : default_packet_traits {
100 typedef Packet4i type;
101 typedef Packet4i half;
113 struct unpacket_traits<Packet4f> {
119 masked_load_available =
false,
120 masked_store_available =
false 122 typedef Packet4f half;
126 struct unpacket_traits<Packet4i> {
127 typedef int32_t type;
132 masked_load_available =
false,
133 masked_store_available =
false 135 typedef Packet4i half;
139 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
142 Packet4f v = {from, from, from, from};
147 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
150 return __builtin_msa_fill_w(from);
154 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(
const float* from) {
158 Packet4f v = {f, f, f, f};
163 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(
const int32_t* from) {
166 return __builtin_msa_fill_w(*from);
170 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
173 return __builtin_msa_fadd_w(a, b);
177 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
180 return __builtin_msa_addv_w(a, b);
184 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
187 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
188 return padd(pset1<Packet4f>(a), countdown);
192 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
195 static const Packet4i countdown = {0, 1, 2, 3};
196 return padd(pset1<Packet4i>(a), countdown);
200 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
203 return __builtin_msa_fsub_w(a, b);
207 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
210 return __builtin_msa_subv_w(a, b);
214 EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
217 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
221 EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
224 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
228 EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
235 EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
242 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
245 return __builtin_msa_fmul_w(a, b);
249 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
252 return __builtin_msa_mulv_w(a, b);
256 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
259 return __builtin_msa_fdiv_w(a, b);
263 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
266 return __builtin_msa_div_s_w(a, b);
270 EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
273 return __builtin_msa_fmadd_w(c, a, b);
277 EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
282 __asm__(
"maddv.w %w[value], %w[a], %w[b]\n" 284 : [value]
"+f"(value)
286 : [a]
"f"(a), [b]
"f"(b));
291 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
294 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
298 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
301 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
305 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
308 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
312 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
315 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
319 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
322 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
326 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
329 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
333 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
336 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
340 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
343 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
347 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
352 return __builtin_msa_fmin_w(a, b);
355 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
356 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
357 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
362 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
365 return __builtin_msa_min_s_w(a, b);
369 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
374 return __builtin_msa_fmax_w(a, b);
377 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
378 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
379 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
384 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
387 return __builtin_msa_max_s_w(a, b);
391 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
394 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
398 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
401 EIGEN_DEBUG_ALIGNED_LOAD
return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
405 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
408 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
412 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
415 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
419 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
422 float f0 = from[0], f1 = from[1];
423 Packet4f v0 = {f0, f0, f0, f0};
424 Packet4f v1 = {f1, f1, f1, f1};
425 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
429 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
432 int32_t i0 = from[0], i1 = from[1];
433 Packet4i v0 = {i0, i0, i0, i0};
434 Packet4i v1 = {i1, i1, i1, i1};
435 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
439 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
442 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
446 EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
449 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
453 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
456 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
460 EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
463 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
467 EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
471 Packet4f v = {f, f, f, f};
473 v[2] = from[2 * stride];
474 v[3] = from[3 * stride];
479 EIGEN_DEVICE_FUNC
inline Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
483 Packet4i v = {i, i, i, i};
485 v[2] = from[2 * stride];
486 v[3] = from[3 * stride];
491 EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
504 EIGEN_DEVICE_FUNC
inline void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
Index stride) {
517 EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
520 __builtin_prefetch(addr);
524 EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
527 __builtin_prefetch(addr);
531 EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
538 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
545 EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
548 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
552 EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
555 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
559 EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
562 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
566 EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
569 Packet4i zero = __builtin_msa_ldi_w(0);
570 return __builtin_msa_add_a_w(zero, a);
574 EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
577 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
578 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
583 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
586 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
587 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
594 EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
597 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
598 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
603 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
606 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
607 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
613 EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
617 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
621 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
623 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
626 Packet4f v = __builtin_msa_fmin_w(a, swapped);
627 v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
630 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
631 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
637 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
640 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
641 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
647 EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
651 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
655 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
657 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
660 Packet4f v = __builtin_msa_fmax_w(a, swapped);
661 v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
664 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
665 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
671 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
674 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
675 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
679 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
680 os <<
"[ " << value.packet[0] <<
"," << std::endl
681 <<
" " << value.packet[1] <<
"," << std::endl
682 <<
" " << value.packet[2] <<
"," << std::endl
683 <<
" " << value.packet[3] <<
" ]";
687 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
690 v4i32 tmp1, tmp2, tmp3, tmp4;
692 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
693 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
694 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
695 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
697 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
698 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
699 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
700 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
703 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
704 os <<
"[ " << value.packet[0] <<
"," << std::endl
705 <<
" " << value.packet[1] <<
"," << std::endl
706 <<
" " << value.packet[2] <<
"," << std::endl
707 <<
" " << value.packet[3] <<
" ]";
711 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
714 v4i32 tmp1, tmp2, tmp3, tmp4;
716 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
717 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
718 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
719 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
721 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
722 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
723 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
724 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
728 EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
731 return __builtin_msa_fsqrt_w(a);
735 EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
739 return __builtin_msa_frsqrt_w(a);
741 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
742 return pdiv(ones, psqrt(a));
747 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
749 int32_t old_mode, new_mode;
751 "cfcmsa %[old_mode], $1\n" 752 "ori %[new_mode], %[old_mode], 3\n" 753 "ctcmsa $1, %[new_mode]\n" 754 "frint.w %w[v], %w[v]\n" 755 "ctcmsa $1, %[old_mode]\n" 757 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
766 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
768 int32_t old_mode, new_mode;
770 "cfcmsa %[old_mode], $1\n" 771 "ori %[new_mode], %[old_mode], 3\n" 772 "xori %[new_mode], %[new_mode], 1\n" 773 "ctcmsa $1, %[new_mode]\n" 774 "frint.w %w[v], %w[v]\n" 775 "ctcmsa $1, %[old_mode]\n" 777 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
786 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
788 int32_t old_mode, new_mode;
790 "cfcmsa %[old_mode], $1\n" 791 "ori %[new_mode], %[old_mode], 3\n" 792 "xori %[new_mode], %[new_mode], 3\n" 793 "ctcmsa $1, %[new_mode]\n" 794 "frint.w %w[v], %w[v]\n" 795 "ctcmsa $1, %[old_mode]\n" 797 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
806 EIGEN_STRONG_INLINE Packet4f pblend(
const Selector<4>& ifPacket,
const Packet4f& thenPacket,
807 const Packet4f& elsePacket) {
808 Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
809 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
810 return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
814 EIGEN_STRONG_INLINE Packet4i pblend(
const Selector<4>& ifPacket,
const Packet4i& thenPacket,
815 const Packet4i& elsePacket) {
816 Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
817 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
818 return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
823 typedef v2f64 Packet2d;
824 typedef v2i64 Packet2l;
825 typedef v2u64 Packet2ul;
827 #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X} 828 #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X} 829 #define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X} 831 inline std::ostream& operator<<(std::ostream& os,
const Packet2d& value) {
832 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
836 inline std::ostream& operator<<(std::ostream& os,
const Packet2l& value) {
837 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
841 inline std::ostream& operator<<(std::ostream& os,
const Packet2ul& value) {
842 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
847 struct packet_traits<double> : default_packet_traits {
848 typedef Packet2d type;
849 typedef Packet2d half;
864 struct unpacket_traits<Packet2d> {
870 masked_load_available =
false,
871 masked_store_available =
false 873 typedef Packet2d half;
877 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
880 Packet2d value = {from, from};
885 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
888 return __builtin_msa_fadd_d(a, b);
892 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
895 static const Packet2d countdown = {0.0, 1.0};
896 return padd(pset1<Packet2d>(a), countdown);
900 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
903 return __builtin_msa_fsub_d(a, b);
907 EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
910 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
914 EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
921 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
924 return __builtin_msa_fmul_d(a, b);
928 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
931 return __builtin_msa_fdiv_d(a, b);
935 EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
938 return __builtin_msa_fmadd_d(c, a, b);
944 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
947 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
951 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
954 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
958 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
961 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
965 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
968 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
972 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
975 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
979 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
984 return __builtin_msa_fmin_d(a, b);
987 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
988 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
989 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
994 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
999 return __builtin_msa_fmax_d(a, b);
1002 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
1003 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
1004 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
1009 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1012 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
1016 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1019 Packet2d value = {*from, *from};
1024 EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1027 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1031 EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1034 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1038 EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1049 EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
1058 EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1061 __builtin_prefetch(addr);
1065 EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1072 EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1075 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1079 EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1082 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
1086 EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1089 Packet2d s = padd(a, preverse(a));
1096 EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
1099 Packet2d p = pmul(a, preverse(a));
1105 EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
1109 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1110 Packet2d v = __builtin_msa_fmin_d(a, swapped);
1113 double a0 = a[0], a1 = a[1];
1114 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
1120 EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
1124 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1125 Packet2d v = __builtin_msa_fmax_d(a, swapped);
1128 double a0 = a[0], a1 = a[1];
1129 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
1134 EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
1137 return __builtin_msa_fsqrt_d(a);
1141 EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
1145 return __builtin_msa_frsqrt_d(a);
1147 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1148 return pdiv(ones, psqrt(a));
1152 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
1153 os <<
"[ " << value.packet[0] <<
"," << std::endl <<
" " << value.packet[1] <<
" ]";
1157 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1160 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1161 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1162 kernel.packet[0] = trn1;
1163 kernel.packet[1] = trn2;
1167 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
1169 int32_t old_mode, new_mode;
1171 "cfcmsa %[old_mode], $1\n" 1172 "ori %[new_mode], %[old_mode], 3\n" 1173 "ctcmsa $1, %[new_mode]\n" 1174 "frint.d %w[v], %w[v]\n" 1175 "ctcmsa $1, %[old_mode]\n" 1177 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1186 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
1188 int32_t old_mode, new_mode;
1190 "cfcmsa %[old_mode], $1\n" 1191 "ori %[new_mode], %[old_mode], 3\n" 1192 "xori %[new_mode], %[new_mode], 1\n" 1193 "ctcmsa $1, %[new_mode]\n" 1194 "frint.d %w[v], %w[v]\n" 1195 "ctcmsa $1, %[old_mode]\n" 1197 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1206 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
1208 int32_t old_mode, new_mode;
1210 "cfcmsa %[old_mode], $1\n" 1211 "ori %[new_mode], %[old_mode], 3\n" 1212 "xori %[new_mode], %[new_mode], 3\n" 1213 "ctcmsa $1, %[new_mode]\n" 1214 "frint.d %w[v], %w[v]\n" 1215 "ctcmsa $1, %[old_mode]\n" 1217 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1226 EIGEN_STRONG_INLINE Packet2d pblend(
const Selector<2>& ifPacket,
const Packet2d& thenPacket,
1227 const Packet2d& elsePacket) {
1228 Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
1229 Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
1230 return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
1237 #endif // EIGEN_PACKET_MATH_MSA_H Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82