10 #ifndef EIGEN_PACKET_MATH_SVE_H 11 #define EIGEN_PACKET_MATH_SVE_H 14 #include "../../InternalHeaderCheck.h" 18 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 19 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 22 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 26 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 28 template <
typename Scalar,
int SVEVectorLength>
29 struct sve_packet_size_selector {
30 enum { size = SVEVectorLength / (
sizeof(Scalar) * CHAR_BIT) };
34 typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
37 struct packet_traits<numext::int32_t> : default_packet_traits {
38 typedef PacketXi type;
39 typedef PacketXi half;
43 size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
63 struct unpacket_traits<PacketXi> {
64 typedef numext::int32_t type;
65 typedef PacketXi half;
67 size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
70 masked_load_available =
false,
71 masked_store_available =
false 76 EIGEN_STRONG_INLINE
void prefetch<numext::int32_t>(
const numext::int32_t* addr) {
77 svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
81 EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(
const numext::int32_t& from) {
82 return svdup_n_s32(from);
86 EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(
const numext::int32_t& a) {
87 numext::int32_t c[packet_traits<numext::int32_t>::size];
88 for (
int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
89 return svadd_s32_x(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
93 EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
94 return svadd_s32_x(svptrue_b32(), a, b);
98 EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
99 return svsub_s32_x(svptrue_b32(), a, b);
103 EIGEN_STRONG_INLINE PacketXi pnegate(
const PacketXi& a) {
104 return svneg_s32_x(svptrue_b32(), a);
108 EIGEN_STRONG_INLINE PacketXi pconj(
const PacketXi& a) {
113 EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
114 return svmul_s32_x(svptrue_b32(), a, b);
118 EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
119 return svdiv_s32_x(svptrue_b32(), a, b);
123 EIGEN_STRONG_INLINE PacketXi pmadd(
const PacketXi& a,
const PacketXi& b,
const PacketXi& c) {
124 return svmla_s32_x(svptrue_b32(), c, a, b);
128 EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
129 return svmin_s32_x(svptrue_b32(), a, b);
133 EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
134 return svmax_s32_x(svptrue_b32(), a, b);
138 EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
139 return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
143 EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
144 return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
148 EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
149 return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
153 EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(
const PacketXi& ) {
154 return svdup_n_s32_x(svptrue_b32(), 0xffffffffu);
158 EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(
const PacketXi& ) {
159 return svdup_n_s32_x(svptrue_b32(), 0);
163 EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
164 return svand_s32_x(svptrue_b32(), a, b);
168 EIGEN_STRONG_INLINE PacketXi por<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
169 return svorr_s32_x(svptrue_b32(), a, b);
173 EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
174 return sveor_s32_x(svptrue_b32(), a, b);
178 EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(
const PacketXi& a,
const PacketXi& b) {
179 return svbic_s32_x(svptrue_b32(), a, b);
183 EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
184 return svasrd_n_s32_x(svptrue_b32(), a, N);
188 EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
189 return svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), svreinterpret_u32_s32(a), N));
193 EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
194 return svlsl_n_s32_x(svptrue_b32(), a, N);
198 EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(
const numext::int32_t* from) {
199 EIGEN_DEBUG_ALIGNED_LOAD
return svld1_s32(svptrue_b32(), from);
203 EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(
const numext::int32_t* from) {
204 EIGEN_DEBUG_UNALIGNED_LOAD
return svld1_s32(svptrue_b32(), from);
208 EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(
const numext::int32_t* from) {
209 svuint32_t indices = svindex_u32(0, 1);
210 indices = svzip1_u32(indices, indices);
211 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
215 EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(
const numext::int32_t* from) {
216 svuint32_t indices = svindex_u32(0, 1);
217 indices = svzip1_u32(indices, indices);
218 indices = svzip1_u32(indices, indices);
219 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
223 EIGEN_STRONG_INLINE
void pstore<numext::int32_t>(numext::int32_t* to,
const PacketXi& from) {
224 EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
228 EIGEN_STRONG_INLINE
void pstoreu<numext::int32_t>(numext::int32_t* to,
const PacketXi& from) {
229 EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
233 EIGEN_DEVICE_FUNC
inline PacketXi pgather<numext::int32_t, PacketXi>(
const numext::int32_t* from,
Index stride) {
235 svint32_t indices = svindex_s32(0, stride);
236 return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
240 EIGEN_DEVICE_FUNC
inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to,
const PacketXi& from,
243 svint32_t indices = svindex_s32(0, stride);
244 svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
248 EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(
const PacketXi& a) {
250 return svlasta_s32(svpfalse_b(), a);
254 EIGEN_STRONG_INLINE PacketXi preverse(
const PacketXi& a) {
259 EIGEN_STRONG_INLINE PacketXi pabs(
const PacketXi& a) {
260 return svabs_s32_x(svptrue_b32(), a);
264 EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(
const PacketXi& a) {
265 return static_cast<numext::int32_t
>(svaddv_s32(svptrue_b32(), a));
269 EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(
const PacketXi& a) {
270 EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
273 svint32_t prod = svmul_s32_x(svptrue_b32(), a, svrev_s32(a));
277 if (EIGEN_ARM64_SVE_VL >= 2048) {
278 half_prod = svtbl_s32(prod, svindex_u32(32, 1));
279 prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
281 if (EIGEN_ARM64_SVE_VL >= 1024) {
282 half_prod = svtbl_s32(prod, svindex_u32(16, 1));
283 prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
285 if (EIGEN_ARM64_SVE_VL >= 512) {
286 half_prod = svtbl_s32(prod, svindex_u32(8, 1));
287 prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
289 if (EIGEN_ARM64_SVE_VL >= 256) {
290 half_prod = svtbl_s32(prod, svindex_u32(4, 1));
291 prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
294 half_prod = svtbl_s32(prod, svindex_u32(2, 1));
295 prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
298 return pfirst<PacketXi>(prod);
302 EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(
const PacketXi& a) {
303 return svminv_s32(svptrue_b32(), a);
307 EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(
const PacketXi& a) {
308 return svmaxv_s32(svptrue_b32(), a);
312 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
313 int buffer[packet_traits<numext::int32_t>::size * N] = {0};
316 PacketXi stride_index = svindex_s32(0, N);
318 for (i = 0; i < N; i++) {
319 svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
321 for (i = 0; i < N; i++) {
322 kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
328 typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
331 struct packet_traits<float> : default_packet_traits {
332 typedef PacketXf type;
333 typedef PacketXf half;
338 size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
358 HasSin = EIGEN_FAST_MATH,
359 HasCos = EIGEN_FAST_MATH,
364 HasTanh = EIGEN_FAST_MATH,
365 HasErf = EIGEN_FAST_MATH,
366 HasErfc = EIGEN_FAST_MATH
371 struct unpacket_traits<PacketXf> {
373 typedef PacketXf half;
374 typedef PacketXi integer_packet;
377 size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
380 masked_load_available =
false,
381 masked_store_available =
false 386 EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(
const float& from) {
387 return svdup_n_f32(from);
391 EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
392 return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), from));
396 EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(
const float& a) {
397 float c[packet_traits<float>::size];
398 for (
int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
399 return svadd_f32_x(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
403 EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
404 return svadd_f32_x(svptrue_b32(), a, b);
408 EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
409 return svsub_f32_x(svptrue_b32(), a, b);
413 EIGEN_STRONG_INLINE PacketXf pnegate(
const PacketXf& a) {
414 return svneg_f32_x(svptrue_b32(), a);
418 EIGEN_STRONG_INLINE PacketXf pconj(
const PacketXf& a) {
423 EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
424 return svmul_f32_x(svptrue_b32(), a, b);
428 EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
429 return svdiv_f32_x(svptrue_b32(), a, b);
433 EIGEN_STRONG_INLINE PacketXf pmadd(
const PacketXf& a,
const PacketXf& b,
const PacketXf& c) {
434 return svmla_f32_x(svptrue_b32(), c, a, b);
438 EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
439 return svmin_f32_x(svptrue_b32(), a, b);
443 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(
const PacketXf& a,
const PacketXf& b) {
444 return pmin<PacketXf>(a, b);
448 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(
const PacketXf& a,
const PacketXf& b) {
449 return svminnm_f32_x(svptrue_b32(), a, b);
453 EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
454 return svmax_f32_x(svptrue_b32(), a, b);
458 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(
const PacketXf& a,
const PacketXf& b) {
459 return pmax<PacketXf>(a, b);
463 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(
const PacketXf& a,
const PacketXf& b) {
464 return svmaxnm_f32_x(svptrue_b32(), a, b);
470 EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
471 return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
475 EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
476 return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
480 EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
481 return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
488 EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
489 return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
493 EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(
const PacketXf& a) {
494 return svrintm_f32_x(svptrue_b32(), a);
498 EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(
const PacketXf& ) {
499 return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), 0xffffffffu));
504 EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
505 return svreinterpret_f32_u32(svand_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
509 EIGEN_STRONG_INLINE PacketXf por<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
510 return svreinterpret_f32_u32(svorr_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
514 EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
515 return svreinterpret_f32_u32(sveor_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
519 EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(
const PacketXf& a,
const PacketXf& b) {
520 return svreinterpret_f32_u32(svbic_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
524 EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(
const float* from) {
525 EIGEN_DEBUG_ALIGNED_LOAD
return svld1_f32(svptrue_b32(), from);
529 EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(
const float* from) {
530 EIGEN_DEBUG_UNALIGNED_LOAD
return svld1_f32(svptrue_b32(), from);
534 EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(
const float* from) {
535 svuint32_t indices = svindex_u32(0, 1);
536 indices = svzip1_u32(indices, indices);
537 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
541 EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(
const float* from) {
542 svuint32_t indices = svindex_u32(0, 1);
543 indices = svzip1_u32(indices, indices);
544 indices = svzip1_u32(indices, indices);
545 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
549 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const PacketXf& from) {
550 EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
554 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const PacketXf& from) {
555 EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
559 EIGEN_DEVICE_FUNC
inline PacketXf pgather<float, PacketXf>(
const float* from,
Index stride) {
561 svint32_t indices = svindex_s32(0, stride);
562 return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
566 EIGEN_DEVICE_FUNC
inline void pscatter<float, PacketXf>(
float* to,
const PacketXf& from,
Index stride) {
568 svint32_t indices = svindex_s32(0, stride);
569 svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
573 EIGEN_STRONG_INLINE
float pfirst<PacketXf>(
const PacketXf& a) {
575 return svlasta_f32(svpfalse_b(), a);
579 EIGEN_STRONG_INLINE PacketXf preverse(
const PacketXf& a) {
584 EIGEN_STRONG_INLINE PacketXf pabs(
const PacketXf& a) {
585 return svabs_f32_x(svptrue_b32(), a);
591 EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(
const PacketXf& a, PacketXf& exponent) {
592 return pfrexp_generic(a, exponent);
596 EIGEN_STRONG_INLINE
float predux<PacketXf>(
const PacketXf& a) {
597 return svaddv_f32(svptrue_b32(), a);
604 EIGEN_STRONG_INLINE
float predux_mul<PacketXf>(
const PacketXf& a) {
605 EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
607 svfloat32_t prod = svmul_f32_x(svptrue_b32(), a, svrev_f32(a));
608 svfloat32_t half_prod;
611 if (EIGEN_ARM64_SVE_VL >= 2048) {
612 half_prod = svtbl_f32(prod, svindex_u32(32, 1));
613 prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
615 if (EIGEN_ARM64_SVE_VL >= 1024) {
616 half_prod = svtbl_f32(prod, svindex_u32(16, 1));
617 prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
619 if (EIGEN_ARM64_SVE_VL >= 512) {
620 half_prod = svtbl_f32(prod, svindex_u32(8, 1));
621 prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
623 if (EIGEN_ARM64_SVE_VL >= 256) {
624 half_prod = svtbl_f32(prod, svindex_u32(4, 1));
625 prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
628 half_prod = svtbl_f32(prod, svindex_u32(2, 1));
629 prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
632 return pfirst<PacketXf>(prod);
636 EIGEN_STRONG_INLINE
float predux_min<PacketXf>(
const PacketXf& a) {
637 return svminv_f32(svptrue_b32(), a);
641 EIGEN_STRONG_INLINE
float predux_max<PacketXf>(
const PacketXf& a) {
642 return svmaxv_f32(svptrue_b32(), a);
646 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
647 float buffer[packet_traits<float>::size * N] = {0};
650 PacketXi stride_index = svindex_s32(0, N);
652 for (i = 0; i < N; i++) {
653 svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
656 for (i = 0; i < N; i++) {
657 kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
662 EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(
const PacketXf& a,
const PacketXf& exponent) {
663 return pldexp_generic(a, exponent);
667 EIGEN_STRONG_INLINE PacketXf psqrt<PacketXf>(
const PacketXf& a) {
668 return svsqrt_f32_x(svptrue_b32(), a);
#endif  // EIGEN_PACKET_MATH_SVE_H

// Cross-reference notes carried over from the documentation export:
//   Eigen: Namespace containing all symbols from the Eigen library.
//     Definition: B01_Experimental.dox:1
//   Definition: Constants.h:239
//   EIGEN_DEFAULT_DENSE_INDEX_TYPE Index: The Index type as used for the API.
//     Definition: Meta.h:82