10 #ifndef EIGEN_COMPLEX_AVX_H 11 #define EIGEN_COMPLEX_AVX_H 14 #include "../../InternalHeaderCheck.h" 22 EIGEN_STRONG_INLINE Packet4cf() {}
23 EIGEN_STRONG_INLINE
explicit Packet4cf(
const __m256& a) : v(a) {}
// Packet traits for std::complex<float>, only declared when EIGEN_VECTORIZE_AVX512 is not defined.
// The visible typedefs select Packet4cf as the full packet and Packet2cf as the half packet.
// NOTE(review): the rest of this struct is not present in this listing.
27 #ifndef EIGEN_VECTORIZE_AVX512 29 struct packet_traits<
std::complex<float> > : default_packet_traits {
30 typedef Packet4cf type;
31 typedef Packet2cf half;
// Reverse traits for Packet4cf: scalar type std::complex<float>, half packet Packet2cf,
// real-valued view Packet8f; masked loads and masked stores are both flagged unavailable.
// NOTE(review): several lines of this struct (and its closing braces) are missing from this listing.
55 struct unpacket_traits<Packet4cf> {
56 typedef std::complex<float> type;
57 typedef Packet2cf half;
58 typedef Packet8f as_real;
63 masked_load_available =
false,
64 masked_store_available =
false 69 EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
// padd: lane-wise float addition of the two underlying registers.
70 return Packet4cf(_mm256_add_ps(a.v, b.v));
73 EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
74 return Packet4cf(_mm256_sub_ps(a.v, b.v));
77 EIGEN_STRONG_INLINE Packet4cf pnegate(
const Packet4cf& a) {
78 return Packet4cf(pnegate(a.v));
81 EIGEN_STRONG_INLINE Packet4cf pconj(
const Packet4cf& a) {
82 const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
83 0x80000000, 0x00000000, 0x80000000));
84 return Packet4cf(_mm256_xor_ps(a.v, mask));
88 EIGEN_STRONG_INLINE Packet4cf pmul(
const Packet4cf& a,
const Packet4cf& b) {
89 __m256 tmp1 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
90 __m256 tmp2 = _mm256_moveldup_ps(a.v);
91 #ifdef EIGEN_VECTORIZE_FMA 92 __m256 result = _mm256_fmaddsub_ps(tmp2, b.v, tmp1);
94 __m256 result = _mm256_addsub_ps(_mm256_mul_ps(tmp2, b.v), tmp1);
96 return Packet4cf(result);
100 EIGEN_STRONG_INLINE Packet4cf pcmp_eq(
const Packet4cf& a,
const Packet4cf& b) {
101 __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
102 return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
106 EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(
const Packet4cf& a) {
107 return Packet4cf(ptrue(Packet8f(a.v)));
110 EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
111 return Packet4cf(_mm256_and_ps(a.v, b.v));
114 EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
115 return Packet4cf(_mm256_or_ps(a.v, b.v));
118 EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
119 return Packet4cf(_mm256_xor_ps(a.v, b.v));
122 EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
123 return Packet4cf(_mm256_andnot_ps(b.v, a.v));
127 EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(
const std::complex<float>* from) {
128 EIGEN_DEBUG_ALIGNED_LOAD
return Packet4cf(_mm256_load_ps(&numext::real_ref(*from)));
131 EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(
const std::complex<float>* from) {
132 EIGEN_DEBUG_UNALIGNED_LOAD
return Packet4cf(_mm256_loadu_ps(&numext::real_ref(*from)));
136 EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(
const std::complex<float>& from) {
137 const float re = std::real(from);
138 const float im = std::imag(from);
139 return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
143 EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(
const std::complex<float>* from) {
145 Packet2cf a = ploaddup<Packet2cf>(from);
146 Packet2cf b = ploaddup<Packet2cf>(from + 1);
147 return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
151 EIGEN_STRONG_INLINE
void pstore<std::complex<float> >(std::complex<float>* to,
const Packet4cf& from) {
152 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(&numext::real_ref(*to), from.v);
155 EIGEN_STRONG_INLINE
void pstoreu<std::complex<float> >(std::complex<float>* to,
const Packet4cf& from) {
156 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(&numext::real_ref(*to), from.v);
// Strided gather: builds a Packet4cf from from[0], from[stride], from[2*stride], from[3*stride].
// NOTE(review): the parameter line declaring `stride` is missing from this listing.
160 EIGEN_DEVICE_FUNC
inline Packet4cf pgather<std::complex<float>, Packet4cf>(
const std::complex<float>* from,
// _mm256_set_ps takes lanes from highest to lowest, so element 3 is listed first.
162 return Packet4cf(_mm256_set_ps(std::imag(from[3 * stride]), std::real(from[3 * stride]), std::imag(from[2 * stride]),
163 std::real(from[2 * stride]), std::imag(from[1 * stride]), std::real(from[1 * stride]),
164 std::imag(from[0 * stride]), std::real(from[0 * stride])));
// Scatter of a Packet4cf: each visible expression rebuilds one std::complex<float> from two
// adjacent floats of the low or high 128-bit half of from.v.
// NOTE(review): the remaining parameter line and the assignment targets of the four
// expressions are missing from this listing.
168 EIGEN_DEVICE_FUNC
inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to,
const Packet4cf& from,
// Lower 128 bits: floats 0..3.
170 __m128 low = _mm256_extractf128_ps(from.v, 0);
172 std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
174 std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
// Upper 128 bits: floats 4..7.
176 __m128 high = _mm256_extractf128_ps(from.v, 1);
178 std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
180 std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
184 EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(
const Packet4cf& a) {
185 return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
189 EIGEN_STRONG_INLINE Packet4cf preverse(
const Packet4cf& a) {
190 __m128 low = _mm256_extractf128_ps(a.v, 0);
191 __m128 high = _mm256_extractf128_ps(a.v, 1);
192 __m128d lowd = _mm_castps_pd(low);
193 __m128d highd = _mm_castps_pd(high);
194 low = _mm_castpd_ps(_mm_shuffle_pd(lowd, lowd, 0x1));
195 high = _mm_castpd_ps(_mm_shuffle_pd(highd, highd, 0x1));
196 __m256 result = _mm256_setzero_ps();
197 result = _mm256_insertf128_ps(result, low, 1);
198 result = _mm256_insertf128_ps(result, high, 0);
199 return Packet4cf(result);
203 EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(
const Packet4cf& a) {
204 return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
208 EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(
const Packet4cf& a) {
209 return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
// Invokes EIGEN_MAKE_CONJ_HELPER_CPLX_REAL for the (Packet4cf, Packet8f) pair.
// NOTE(review): presumably generates conj_helper specialisations for mixed complex/real
// products — confirm against the macro definition.
212 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf, Packet8f)
215 EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(
const Packet4cf& a,
const Packet4cf& b) {
216 return pdiv_complex(a, b);
220 EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(
const Packet4cf& x) {
221 return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
226 EIGEN_STRONG_INLINE Packet2cd() {}
227 EIGEN_STRONG_INLINE
explicit Packet2cd(
const __m256d& a) : v(a) {}
// Packet traits for std::complex<double>, only declared when EIGEN_VECTORIZE_AVX512 is not defined.
// The visible typedefs select Packet2cd as the full packet and Packet1cd as the half packet.
// NOTE(review): the rest of this struct is not present in this listing.
231 #ifndef EIGEN_VECTORIZE_AVX512 233 struct packet_traits<
std::complex<double> > : default_packet_traits {
234 typedef Packet2cd type;
235 typedef Packet1cd half;
// Reverse traits for Packet2cd: scalar type std::complex<double>, half packet Packet1cd,
// real-valued view Packet4d; masked loads and masked stores are both flagged unavailable.
// NOTE(review): several lines of this struct (and its closing braces) are missing from this listing.
258 struct unpacket_traits<Packet2cd> {
259 typedef std::complex<double> type;
260 typedef Packet1cd half;
261 typedef Packet4d as_real;
266 masked_load_available =
false,
267 masked_store_available =
false 272 EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
// padd: lane-wise double addition of the two underlying registers.
273 return Packet2cd(_mm256_add_pd(a.v, b.v));
276 EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
277 return Packet2cd(_mm256_sub_pd(a.v, b.v));
280 EIGEN_STRONG_INLINE Packet2cd pnegate(
const Packet2cd& a) {
281 return Packet2cd(pnegate(a.v));
284 EIGEN_STRONG_INLINE Packet2cd pconj(
const Packet2cd& a) {
285 const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
286 return Packet2cd(_mm256_xor_pd(a.v, mask));
290 EIGEN_STRONG_INLINE Packet2cd pmul(
const Packet2cd& a,
const Packet2cd& b) {
291 __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0xF), _mm256_permute_pd(b.v, 0x5));
292 __m256d tmp2 = _mm256_movedup_pd(a.v);
293 #ifdef EIGEN_VECTORIZE_FMA 294 __m256d result = _mm256_fmaddsub_pd(tmp2, b.v, tmp1);
296 __m256d result = _mm256_addsub_pd(_mm256_mul_pd(tmp2, b.v), tmp1);
298 return Packet2cd(result);
302 EIGEN_STRONG_INLINE Packet2cd pcmp_eq(
const Packet2cd& a,
const Packet2cd& b) {
303 __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
304 return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
308 EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(
const Packet2cd& a) {
309 return Packet2cd(ptrue(Packet4d(a.v)));
312 EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
313 return Packet2cd(_mm256_and_pd(a.v, b.v));
316 EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
317 return Packet2cd(_mm256_or_pd(a.v, b.v));
320 EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
321 return Packet2cd(_mm256_xor_pd(a.v, b.v));
324 EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
325 return Packet2cd(_mm256_andnot_pd(b.v, a.v));
329 EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(
const std::complex<double>* from) {
330 EIGEN_DEBUG_ALIGNED_LOAD
return Packet2cd(_mm256_load_pd((
const double*)from));
333 EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(
const std::complex<double>* from) {
334 EIGEN_DEBUG_UNALIGNED_LOAD
return Packet2cd(_mm256_loadu_pd((
const double*)from));
338 EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(
const std::complex<double>& from) {
341 return Packet2cd(_mm256_broadcast_pd((
const __m128d*)(
const void*)&from));
345 EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(
const std::complex<double>* from) {
346 return pset1<Packet2cd>(*from);
350 EIGEN_STRONG_INLINE
void pstore<std::complex<double> >(std::complex<double>* to,
const Packet2cd& from) {
351 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd((
double*)to, from.v);
354 EIGEN_STRONG_INLINE
void pstoreu<std::complex<double> >(std::complex<double>* to,
const Packet2cd& from) {
355 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd((
double*)to, from.v);
// Strided gather: builds a Packet2cd from from[0] and from[stride].
// NOTE(review): the parameter line declaring `stride` is missing from this listing.
359 EIGEN_DEVICE_FUNC
inline Packet2cd pgather<std::complex<double>, Packet2cd>(
const std::complex<double>* from,
// _mm256_set_pd takes lanes from highest to lowest, so element 1 is listed first.
361 return Packet2cd(_mm256_set_pd(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
362 std::real(from[0 * stride])));
// Strided scatter: writes the packet's two complex values to to[0] and to[stride].
// NOTE(review): the parameter line declaring `stride` is missing from this listing.
366 EIGEN_DEVICE_FUNC
inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to,
const Packet2cd& from,
// Lower 128 bits hold the first complex: lane 0 is read directly, lane 1 via a shuffle.
368 __m128d low = _mm256_extractf128_pd(from.v, 0);
369 to[stride * 0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
// Upper 128 bits hold the second complex.
370 __m128d high = _mm256_extractf128_pd(from.v, 1);
371 to[stride * 1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
375 EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(
const Packet2cd& a) {
376 __m128d low = _mm256_extractf128_pd(a.v, 0);
377 EIGEN_ALIGN16
double res[2];
378 _mm_store_pd(res, low);
379 return std::complex<double>(res[0], res[1]);
383 EIGEN_STRONG_INLINE Packet2cd preverse(
const Packet2cd& a) {
384 __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
385 return Packet2cd(result);
389 EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(
const Packet2cd& a) {
390 return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
394 EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(
const Packet2cd& a) {
395 return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
// Invokes EIGEN_MAKE_CONJ_HELPER_CPLX_REAL for the (Packet2cd, Packet4d) pair.
// NOTE(review): presumably generates conj_helper specialisations for mixed complex/real
// products — confirm against the macro definition.
398 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd, Packet4d)
401 EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(
const Packet2cd& a,
const Packet2cd& b) {
402 return pdiv_complex(a, b);
406 EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(
const Packet2cd& x) {
407 return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
410 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4cf, 4>& kernel) {
411 __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
412 __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
413 __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
414 __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
416 __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
417 __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
418 __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
419 __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
421 kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
422 kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
423 kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
424 kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
427 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
428 __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0 + (2 << 4));
429 kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1 + (3 << 4));
430 kernel.packet[0].v = tmp;
434 EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(
const Packet2cd& a) {
435 return psqrt_complex<Packet2cd>(a);
439 EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(
const Packet4cf& a) {
440 return psqrt_complex<Packet4cf>(a);
444 EIGEN_STRONG_INLINE Packet2cd plog<Packet2cd>(
const Packet2cd& a) {
445 return plog_complex<Packet2cd>(a);
449 EIGEN_STRONG_INLINE Packet4cf plog<Packet4cf>(
const Packet4cf& a) {
450 return plog_complex<Packet4cf>(a);
454 EIGEN_STRONG_INLINE Packet4cf pexp<Packet4cf>(
const Packet4cf& a) {
455 return pexp_complex<Packet4cf>(a);
458 #ifdef EIGEN_VECTORIZE_FMA 461 EIGEN_STRONG_INLINE Packet4cf pmadd(
const Packet4cf& a,
const Packet4cf& b,
const Packet4cf& c) {
462 __m256 a_odd = _mm256_movehdup_ps(a.v);
463 __m256 a_even = _mm256_moveldup_ps(a.v);
464 __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
465 __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmaddsub_ps(a_odd, b_swap, c.v));
466 return Packet4cf(result);
469 EIGEN_STRONG_INLINE Packet4cf pmsub(
const Packet4cf& a,
const Packet4cf& b,
const Packet4cf& c) {
470 __m256 a_odd = _mm256_movehdup_ps(a.v);
471 __m256 a_even = _mm256_moveldup_ps(a.v);
472 __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
473 __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmsubadd_ps(a_odd, b_swap, c.v));
474 return Packet4cf(result);
477 EIGEN_STRONG_INLINE Packet4cf pnmadd(
const Packet4cf& a,
const Packet4cf& b,
const Packet4cf& c) {
478 return pnegate(pmsub(a, b, c));
481 EIGEN_STRONG_INLINE Packet4cf pnmsub(
const Packet4cf& a,
const Packet4cf& b,
const Packet4cf& c) {
482 return pnegate(pmadd(a, b, c));
486 EIGEN_STRONG_INLINE Packet2cd pmadd(
const Packet2cd& a,
const Packet2cd& b,
const Packet2cd& c) {
487 __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
488 __m256d a_even = _mm256_movedup_pd(a.v);
489 __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
490 __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmaddsub_pd(a_odd, b_swap, c.v));
491 return Packet2cd(result);
494 EIGEN_STRONG_INLINE Packet2cd pmsub(
const Packet2cd& a,
const Packet2cd& b,
const Packet2cd& c) {
495 __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
496 __m256d a_even = _mm256_movedup_pd(a.v);
497 __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
498 __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmsubadd_pd(a_odd, b_swap, c.v));
499 return Packet2cd(result);
502 EIGEN_STRONG_INLINE Packet2cd pnmadd(
const Packet2cd& a,
const Packet2cd& b,
const Packet2cd& c) {
503 return pnegate(pmsub(a, b, c));
506 EIGEN_STRONG_INLINE Packet2cd pnmsub(
const Packet2cd& a,
const Packet2cd& b,
const Packet2cd& c) {
507 return pnegate(pmadd(a, b, c));
// Marks Packet2cf as having packet-segment support by deriving from std::true_type.
516 struct has_packet_segment<Packet2cf> : std::true_type {};
// Marks Packet4cf as having packet-segment support by deriving from std::true_type.
519 struct has_packet_segment<Packet4cf> : std::true_type {};
522 inline Packet2cf ploaduSegment<Packet2cf>(
const std::complex<float>* from,
Index begin,
Index count) {
523 return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
// Masked unaligned store of a Packet2cf; the lane mask comes from segment_mask_2x64(begin, count).
// NOTE(review): the parameter line declaring `count` is missing from this listing.
527 inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to,
const Packet2cf& from,
Index begin,
529 _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
533 inline Packet4cf ploaduSegment<Packet4cf>(
const std::complex<float>* from,
Index begin,
Index count) {
534 return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
// Masked unaligned store of a Packet4cf; the lane mask comes from segment_mask_4x64(begin, count).
// NOTE(review): the parameter line declaring `count` is missing from this listing.
538 inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to,
const Packet4cf& from,
Index begin,
540 _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
// Marks Packet2cd as having packet-segment support by deriving from std::true_type.
546 struct has_packet_segment<Packet2cd> : std::true_type {};
549 inline Packet2cd ploaduSegment<Packet2cd>(
const std::complex<double>* from,
Index begin,
Index count) {
550 return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
// Masked unaligned store of a Packet2cd. Each complex<double> spans two 64-bit lanes,
// so begin and count are doubled before building the four-lane mask.
// NOTE(review): the parameter line(s) declaring `begin` and `count` are missing from this listing.
554 inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to,
const Packet2cd& from,
556 _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
565 #endif // EIGEN_COMPLEX_AVX_H Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
Definition: BFloat16.h:231
Definition: Constants.h:238
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82