$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
11 #define EIGEN_PACKET_MATH_ZVECTOR_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
// Backend defaults. Every macro below is only defined when the including
// translation unit has not already provided its own value.

// Product threshold; defaults to 16 for this backend.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#endif

// Flag macro (no value): marks multiply-add as a single instruction.
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// Default register count assumed for this architecture.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
31 
// Packet types built on the compiler's __vector types. The trailing digit in
// each name is the lane count (e.g. Packet4i = 4 x int, Packet2d = 2 x double).
typedef __vector int Packet4i;
typedef __vector unsigned int Packet4ui;
typedef __vector __bool int Packet4bi;
typedef __vector short int Packet8i;
typedef __vector unsigned char Packet16uc;
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;

// Z14 has builtin support for float vectors
// When __ARCH__ is undefined or >= 12 the native float vector is used;
// otherwise Packet4f is a pair of Packet2d values (two floats per half,
// held as doubles).
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
typedef __vector float Packet4f;
#else
typedef struct {
  Packet2d v4f[2];
} Packet4f;
#endif
49 
// Overlay of scalar arrays and vector types sharing the same storage. The
// stream operators in this file write a vector member and then read the
// matching scalar array to print the individual lanes.
typedef union {
  // Scalar views.
  numext::int32_t i[4];
  numext::uint32_t ui[4];
  numext::int64_t l[2];
  numext::uint64_t ul[2];
  double d[2];
  float f[4];
  // Vector views.
  Packet4i v4i;
  Packet4ui v4ui;
  Packet2l v2l;
  Packet2ul v2ul;
  Packet2d v2d;
  // The float vector view only exists when Packet4f is a native vector type.
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
  Packet4f v4f;
#endif
} Packet;
66 
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead

// The *_FAST_* macros declare a constant named p4i_/p2d_/p2l_<NAME> built with
// vec_splat_s32 / vec_splat_s64 and reinterpreted as the packet type.
#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))

#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))

#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))

// The plain variants build the constant through pset1<> instead.
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)

// These constants are endian-agnostic
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);  //{ 0, 0, 0, 0,}
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);   //{ 1, 1, 1, 1}

// p2d_ZERO reinterprets an all-zero integer splat, i.e. +0.0 in both lanes.
static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);

static Packet2d p2d_ONE = {1.0, 1.0};
// Both lanes carry only the sign bit (bit pattern 0x8000000000000000), i.e. -0.0.
static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
                             numext::bit_cast<double>(0x8000000000000000ull)};
93 
94 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
95 #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
96 
97 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
98 
99 #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
100  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
101 
102 static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
103 static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
104 static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
105 #endif
106 
// Lane-index ramps; plset<> adds them to a broadcast value.
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
// Built by a byte-wise vec_sld over p2d_ZERO and p2d_ONE with an 8-byte shift;
// presumably yields {0.0, 1.0} -- TODO confirm against vec_sld semantics.
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
    vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));

// Byte-selection masks for vec_perm.
// PSET64_HI repeats bytes 0..7 in both halves (used by ploaddup<Packet2d>).
static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
// DUPLICATE32_HI repeats each of the first two 32-bit words (used by ploaddup<Packet4i>).
static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
114 
// Mask alignment
// Clears the low four bits of an address, i.e. rounds it down to a multiple of 16.
#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0

// Casts x to std::ptrdiff_t and applies EIGEN_MASK_ALIGNMENT to it.
#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
119 
// Handle endianness properly while loading constants
// Define global static constants:
// All of the following are byte-index masks. Several are derived from earlier
// ones, so the declaration order must be preserved.

// Identity mask and the 32-bit / 64-bit lane reversals used by preverse<>.
static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};

static Packet16uc p16uc_PSET32_WODD =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};

static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
(Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);     //{ 0,1,2,3, 4,5,6,7,
16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);     //{
8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
// Two-input masks used by ptranspose(PacketBlock<Packet2d, 2>&): indices 0..15
// select from the first vec_perm operand and 16..31 from the second.
static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};

static Packet16uc p16uc_COMPLEX32_REV =
    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

static Packet16uc p16uc_COMPLEX32_REV2 =
    vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);  //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
150 
// Prefetch hint. Uses __builtin_prefetch when the compiler provides it (or is
// GNU-compatible) and falls back to an inline-asm statement otherwise.
// Note that both expansions already end with a semicolon.
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm("   pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
#endif
156 
157 template <>
158 struct packet_traits<int> : default_packet_traits {
159  typedef Packet4i type;
160  typedef Packet4i half;
161  enum {
162  Vectorizable = 1,
163  AlignedOnScalar = 1,
164  size = 4,
165 
166  HasAdd = 1,
167  HasSub = 1,
168  HasMul = 1,
169  HasDiv = 1,
170  HasBlend = 1
171  };
172 };
173 
174 template <>
175 struct packet_traits<float> : default_packet_traits {
176  typedef Packet4f type;
177  typedef Packet4f half;
178  enum {
179  Vectorizable = 1,
180  AlignedOnScalar = 1,
181  size = 4,
182 
183  HasCmp = 1,
184  HasAdd = 1,
185  HasSub = 1,
186  HasMul = 1,
187  HasDiv = 1,
188  HasMin = 1,
189  HasMax = 1,
190  HasAbs = 1,
191  HasSin = 0,
192  HasCos = 0,
193  HasLog = 0,
194  HasExp = 1,
195  HasSqrt = 1,
196  HasRsqrt = 1,
197  HasTanh = 1,
198  HasErf = 1,
199  HasNegate = 1,
200  HasBlend = 1
201  };
202 };
203 
204 template <>
205 struct packet_traits<double> : default_packet_traits {
206  typedef Packet2d type;
207  typedef Packet2d half;
208  enum {
209  Vectorizable = 1,
210  AlignedOnScalar = 1,
211  size = 2,
212 
213  HasAdd = 1,
214  HasSub = 1,
215  HasMul = 1,
216  HasDiv = 1,
217  HasMin = 1,
218  HasMax = 1,
219  HasAbs = 1,
220  HasSin = 0,
221  HasCos = 0,
222  HasLog = 0,
223  HasExp = 1,
224  HasSqrt = 1,
225  HasRsqrt = 1,
226  HasNegate = 1,
227  HasBlend = 1
228  };
229 };
230 
231 template <>
232 struct unpacket_traits<Packet4i> {
233  typedef int type;
234  enum {
235  size = 4,
236  alignment = Aligned16,
237  vectorizable = true,
238  masked_load_available = false,
239  masked_store_available = false
240  };
241  typedef Packet4i half;
242 };
243 template <>
244 struct unpacket_traits<Packet4f> {
245  typedef float type;
246  enum {
247  size = 4,
248  alignment = Aligned16,
249  vectorizable = true,
250  masked_load_available = false,
251  masked_store_available = false
252  };
253  typedef Packet4f half;
254  typedef Packet4i integer_packet;
255 };
256 template <>
257 struct unpacket_traits<Packet2d> {
258  typedef double type;
259  enum {
260  size = 2,
261  alignment = Aligned16,
262  vectorizable = true,
263  masked_load_available = false,
264  masked_store_available = false
265  };
266  typedef Packet2d half;
267  typedef Packet2l integer_packet;
268 };
269 
270 /* Forward declaration */
271 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
272 
273 inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
274  Packet vt;
275  vt.v4i = v;
276  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
277  return s;
278 }
279 
280 inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
281  Packet vt;
282  vt.v4ui = v;
283  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
284  return s;
285 }
286 
287 inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
288  Packet vt;
289  vt.v2l = v;
290  s << vt.l[0] << ", " << vt.l[1];
291  return s;
292 }
293 
294 inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
295  Packet vt;
296  vt.v2ul = v;
297  s << vt.ul[0] << ", " << vt.ul[1];
298  return s;
299 }
300 
301 inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
302  Packet vt;
303  vt.v2d = v;
304  s << vt.d[0] << ", " << vt.d[1];
305  return s;
306 }
307 
308 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
309 inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
310  Packet vt;
311  vt.v4f = v;
312  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
313  return s;
314 }
315 #endif
316 
317 template <>
318 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
319  EIGEN_DEBUG_ALIGNED_LOAD
320  return vec_xl(0, from);
321 }
322 
323 template <>
324 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
325  EIGEN_DEBUG_ALIGNED_LOAD
326  return vec_xl(0, from);
327 }
328 
329 template <>
330 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
331  EIGEN_DEBUG_ALIGNED_STORE
332  vec_xst(from, 0, to);
333 }
334 
335 template <>
336 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
337  EIGEN_DEBUG_ALIGNED_STORE
338  vec_xst(from, 0, to);
339 }
340 
341 template <>
342 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
343  return pfrexp_generic(a, exponent);
344 }
345 
346 template <>
347 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
348  return pfrexp_generic(a, exponent);
349 }
350 
351 template <>
352 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
353  return vec_splats(from);
354 }
355 template <>
356 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
357  return vec_splats(from);
358 }
359 
360 template <>
361 EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
362  a3 = pload<Packet4i>(a);
363  a0 = vec_splat(a3, 0);
364  a1 = vec_splat(a3, 1);
365  a2 = vec_splat(a3, 2);
366  a3 = vec_splat(a3, 3);
367 }
368 
369 template <>
370 EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
371  Packet2d& a3) {
372  a1 = pload<Packet2d>(a);
373  a0 = vec_splat(a1, 0);
374  a1 = vec_splat(a1, 1);
375  a3 = pload<Packet2d>(a + 2);
376  a2 = vec_splat(a3, 0);
377  a3 = vec_splat(a3, 1);
378 }
379 
380 template <>
381 EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
382  EIGEN_ALIGN16 int ai[4];
383  ai[0] = from[0 * stride];
384  ai[1] = from[1 * stride];
385  ai[2] = from[2 * stride];
386  ai[3] = from[3 * stride];
387  return pload<Packet4i>(ai);
388 }
389 
390 template <>
391 EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
392  EIGEN_ALIGN16 double af[2];
393  af[0] = from[0 * stride];
394  af[1] = from[1 * stride];
395  return pload<Packet2d>(af);
396 }
397 
398 template <>
399 EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
400  EIGEN_ALIGN16 int ai[4];
401  pstore<int>((int*)ai, from);
402  to[0 * stride] = ai[0];
403  to[1 * stride] = ai[1];
404  to[2 * stride] = ai[2];
405  to[3 * stride] = ai[3];
406 }
407 
408 template <>
409 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
410  EIGEN_ALIGN16 double af[2];
411  pstore<double>(af, from);
412  to[0 * stride] = af[0];
413  to[1 * stride] = af[1];
414 }
415 
416 template <>
417 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
418  return (a + b);
419 }
420 template <>
421 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
422  return (a + b);
423 }
424 
425 template <>
426 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
427  return (a - b);
428 }
429 template <>
430 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
431  return (a - b);
432 }
433 
434 template <>
435 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
436  return (a * b);
437 }
438 template <>
439 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
440  return (a * b);
441 }
442 
443 template <>
444 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
445  return (a / b);
446 }
447 template <>
448 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
449  return (a / b);
450 }
451 
452 template <>
453 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
454  return (-a);
455 }
456 template <>
457 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
458  return (-a);
459 }
460 
// Conjugate of a real integer packet: returns the input unchanged.
template <>
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
  return a;
}
// Conjugate of a real double packet: returns the input unchanged.
template <>
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
  return a;
}
469 
470 template <>
471 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
472  return padd<Packet4i>(pmul<Packet4i>(a, b), c);
473 }
474 template <>
475 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
476  return vec_madd(a, b, c);
477 }
478 
479 template <>
480 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
481  return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
482 }
483 template <>
484 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
485  return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
486 }
487 
488 template <>
489 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
490  return vec_min(a, b);
491 }
492 template <>
493 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
494  return vec_min(a, b);
495 }
496 
497 template <>
498 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
499  return vec_max(a, b);
500 }
501 template <>
502 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
503  return vec_max(a, b);
504 }
505 
506 template <>
507 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
508  return vec_and(a, b);
509 }
510 template <>
511 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
512  return vec_and(a, b);
513 }
514 
515 template <>
516 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
517  return vec_or(a, b);
518 }
519 template <>
520 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
521  return vec_or(a, b);
522 }
523 
524 template <>
525 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
526  return vec_xor(a, b);
527 }
528 template <>
529 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
530  return vec_xor(a, b);
531 }
532 
533 template <>
534 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
535  return pand<Packet4i>(a, vec_nor(b, b));
536 }
537 template <>
538 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
539  return vec_and(a, vec_nor(b, b));
540 }
541 
542 template <>
543 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
544  /* Uses non-default rounding for vec_round */
545  return __builtin_s390_vfidb(a, 0, 1);
546 }
547 template <>
548 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
549  return vec_ceil(a);
550 }
551 template <>
552 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
553  return vec_floor(a);
554 }
555 
556 template <>
557 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
558  return pload<Packet4i>(from);
559 }
560 template <>
561 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
562  return pload<Packet2d>(from);
563 }
564 
565 template <>
566 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
567  Packet4i p = pload<Packet4i>(from);
568  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
569 }
570 
571 template <>
572 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
573  Packet2d p = pload<Packet2d>(from);
574  return vec_perm(p, p, p16uc_PSET64_HI);
575 }
576 
577 template <>
578 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
579  pstore<int>(to, from);
580 }
581 template <>
582 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
583  pstore<double>(to, from);
584 }
585 
// Prefetch hint for int data: forwards `addr` to EIGEN_ZVECTOR_PREFETCH.
template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  EIGEN_ZVECTOR_PREFETCH(addr);
}
// Prefetch hint for double data: forwards `addr` to EIGEN_ZVECTOR_PREFETCH.
template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  EIGEN_ZVECTOR_PREFETCH(addr);
}
594 
595 template <int N>
596 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
597  return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
598 }
599 template <int N>
600 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
601  return Packet4i {
602  parithmetic_shift_right<N>(a[0]),
603  parithmetic_shift_right<N>(a[1]),
604  parithmetic_shift_right<N>(a[2]),
605  parithmetic_shift_right<N>(a[3]) };
606 }
607 
608 template <int N>
609 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
610  return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
611 }
612 template <int N>
613 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
614  return Packet4i {
615  plogical_shift_right<N>(a[0]),
616  plogical_shift_right<N>(a[1]),
617  plogical_shift_right<N>(a[2]),
618  plogical_shift_right<N>(a[3]) };
619 }
620 
621 template <int N>
622 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
623  return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
624 }
625 template <int N>
626 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
627  return Packet4i {
628  plogical_shift_left<N>(a[0]),
629  plogical_shift_left<N>(a[1]),
630  plogical_shift_left<N>(a[2]),
631  plogical_shift_left<N>(a[3]) };
632 }
633 
634 template <>
635 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
636  EIGEN_ALIGN16 int x[4];
637  pstore(x, a);
638  return x[0];
639 }
640 template <>
641 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
642  EIGEN_ALIGN16 double x[2];
643  pstore(x, a);
644  return x[0];
645 }
646 
647 template <>
648 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
649  return reinterpret_cast<Packet4i>(
650  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
651 }
652 
653 template <>
654 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
655  return reinterpret_cast<Packet2d>(
656  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
657 }
658 
659 template <>
660 EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
661  return vec_abs(a);
662 }
663 template <>
664 EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
665  return vec_abs(a);
666 }
667 
668 template <>
669 EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
670  Packet4i b, sum;
671  b = vec_sld(a, a, 8);
672  sum = padd<Packet4i>(a, b);
673  b = vec_sld(sum, sum, 4);
674  sum = padd<Packet4i>(sum, b);
675  return pfirst(sum);
676 }
677 
678 template <>
679 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
680  Packet2d b, sum;
681  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
682  sum = padd<Packet2d>(a, b);
683  return pfirst(sum);
684 }
685 
686 // Other reduction functions:
687 // mul
688 template <>
689 EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
690  EIGEN_ALIGN16 int aux[4];
691  pstore(aux, a);
692  return aux[0] * aux[1] * aux[2] * aux[3];
693 }
694 
695 template <>
696 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
697  return pfirst(
698  pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
699 }
700 
701 // min
702 template <>
703 EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
704  Packet4i b, res;
705  b = pmin<Packet4i>(a, vec_sld(a, a, 8));
706  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
707  return pfirst(res);
708 }
709 
710 template <>
711 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
712  return pfirst(pmin<Packet2d>(
713  a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
714 }
715 
716 // max
717 template <>
718 EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
719  Packet4i b, res;
720  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
721  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
722  return pfirst(res);
723 }
724 
725 // max
726 template <>
727 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
728  return pfirst(pmax<Packet2d>(
729  a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
730 }
731 
732 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
733  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
734  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
735  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
736  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
737  kernel.packet[0] = vec_mergeh(t0, t2);
738  kernel.packet[1] = vec_mergel(t0, t2);
739  kernel.packet[2] = vec_mergeh(t1, t3);
740  kernel.packet[3] = vec_mergel(t1, t3);
741 }
742 
743 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
744  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
745  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
746  kernel.packet[0] = t0;
747  kernel.packet[1] = t1;
748 }
749 
750 template <>
751 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
752  const Packet4i& elsePacket) {
753  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
754  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
755  return vec_sel(elsePacket, thenPacket, mask);
756 }
757 
758 template <>
759 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
760  const Packet2d& elsePacket) {
761  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
762  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
763  return vec_sel(elsePacket, thenPacket, mask);
764 }
765 
766 /* z13 has no vector float support so we emulate that with double
767  z14 has proper vector float support.
768 */
769 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
770 /* Helper function to simulate a vec_splat_packet4f
771  */
772 template <int element>
773 EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
774  Packet4f splat;
775  switch (element) {
776  case 0:
777  splat.v4f[0] = vec_splat(from.v4f[0], 0);
778  splat.v4f[1] = splat.v4f[0];
779  break;
780  case 1:
781  splat.v4f[0] = vec_splat(from.v4f[0], 1);
782  splat.v4f[1] = splat.v4f[0];
783  break;
784  case 2:
785  splat.v4f[0] = vec_splat(from.v4f[1], 0);
786  splat.v4f[1] = splat.v4f[0];
787  break;
788  case 3:
789  splat.v4f[0] = vec_splat(from.v4f[1], 1);
790  splat.v4f[1] = splat.v4f[0];
791  break;
792  }
793  return splat;
794 }
795 
796 template <>
797 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
798  // FIXME: No intrinsic yet
799  EIGEN_DEBUG_ALIGNED_LOAD
800  Packet4f vfrom;
801  vfrom.v4f[0] = vec_ld2f(&from[0]);
802  vfrom.v4f[1] = vec_ld2f(&from[2]);
803  return vfrom;
804 }
805 
806 template <>
807 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
808  // FIXME: No intrinsic yet
809  EIGEN_DEBUG_ALIGNED_STORE
810  vec_st2f(from.v4f[0], &to[0]);
811  vec_st2f(from.v4f[1], &to[2]);
812 }
813 
814 template <>
815 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
816  Packet4f to;
817  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
818  to.v4f[1] = to.v4f[0];
819  return to;
820 }
821 
822 template <>
823 EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
824  a3 = pload<Packet4f>(a);
825  a0 = vec_splat_packet4f<0>(a3);
826  a1 = vec_splat_packet4f<1>(a3);
827  a2 = vec_splat_packet4f<2>(a3);
828  a3 = vec_splat_packet4f<3>(a3);
829 }
830 
831 template <>
832 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
833  EIGEN_ALIGN16 float ai[4];
834  ai[0] = from[0 * stride];
835  ai[1] = from[1 * stride];
836  ai[2] = from[2 * stride];
837  ai[3] = from[3 * stride];
838  return pload<Packet4f>(ai);
839 }
840 
841 template <>
842 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
843  EIGEN_ALIGN16 float ai[4];
844  pstore<float>((float*)ai, from);
845  to[0 * stride] = ai[0];
846  to[1 * stride] = ai[1];
847  to[2 * stride] = ai[2];
848  to[3 * stride] = ai[3];
849 }
850 
851 template <>
852 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
853  Packet4f c;
854  c.v4f[0] = a.v4f[0] + b.v4f[0];
855  c.v4f[1] = a.v4f[1] + b.v4f[1];
856  return c;
857 }
858 
859 template <>
860 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
861  Packet4f c;
862  c.v4f[0] = a.v4f[0] - b.v4f[0];
863  c.v4f[1] = a.v4f[1] - b.v4f[1];
864  return c;
865 }
866 
867 template <>
868 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
869  Packet4f c;
870  c.v4f[0] = a.v4f[0] * b.v4f[0];
871  c.v4f[1] = a.v4f[1] * b.v4f[1];
872  return c;
873 }
874 
875 template <>
876 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
877  Packet4f c;
878  c.v4f[0] = a.v4f[0] / b.v4f[0];
879  c.v4f[1] = a.v4f[1] / b.v4f[1];
880  return c;
881 }
882 
883 template <>
884 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
885  Packet4f c;
886  c.v4f[0] = -a.v4f[0];
887  c.v4f[1] = -a.v4f[1];
888  return c;
889 }
890 
891 template <>
892 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
893  Packet4f res;
894  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
895  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
896  return res;
897 }
898 
899 template <>
900 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
901  Packet4f res;
902  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
903  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
904  return res;
905 }
906 
907 template <>
908 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
909  Packet4f res;
910  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
911  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
912  return res;
913 }
914 
915 template <>
916 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
917  Packet4f res;
918  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
919  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
920  return res;
921 }
922 
923 template <>
924 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
925  Packet4f res;
926  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
927  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
928  return res;
929 }
930 
931 template <>
932 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
933  Packet4f res;
934  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
935  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
936  return res;
937 }
938 
939 template <>
940 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
941  Packet4f res;
942  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
943  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
944  return res;
945 }
946 
947 template <>
948 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
949  Packet4f res;
950  res.v4f[0] = generic_round(a.v4f[0]);
951  res.v4f[1] = generic_round(a.v4f[1]);
952  return res;
953 }
954 
955 template <>
956 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
957  Packet4f res;
958  res.v4f[0] = vec_ceil(a.v4f[0]);
959  res.v4f[1] = vec_ceil(a.v4f[1]);
960  return res;
961 }
962 
963 template <>
964 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
965  Packet4f res;
966  res.v4f[0] = vec_floor(a.v4f[0]);
967  res.v4f[1] = vec_floor(a.v4f[1]);
968  return res;
969 }
970 
971 template <>
972 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
973  Packet4f p = pload<Packet4f>(from);
974  p.v4f[1] = vec_splat(p.v4f[0], 1);
975  p.v4f[0] = vec_splat(p.v4f[0], 0);
976  return p;
977 }
978 
979 template <>
980 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
981  EIGEN_ALIGN16 float x[2];
982  vec_st2f(a.v4f[0], &x[0]);
983  return x[0];
984 }
985 
986 template <>
987 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
988  Packet4f rev;
989  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
990  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
991  return rev;
992 }
993 
994 template <>
995 EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
996  Packet4f res;
997  res.v4f[0] = pabs(a.v4f[0]);
998  res.v4f[1] = pabs(a.v4f[1]);
999  return res;
1000 }
1001 
1002 template <>
1003 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1004  Packet2d sum;
1005  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
1006  double first = predux<Packet2d>(sum);
1007  return static_cast<float>(first);
1008 }
1009 
1010 template <>
1011 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1012  // Return predux_mul<Packet2d> of the subvectors product
1013  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
1014 }
1015 
1016 template <>
1017 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
1018  Packet2d b, res;
1019  b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
1020  res = pmin<Packet2d>(
1021  b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
1022  return static_cast<float>(pfirst(res));
1023 }
1024 
1025 template <>
1026 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
1027  Packet2d b, res;
1028  b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
1029  res = pmax<Packet2d>(
1030  b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
1031  return static_cast<float>(pfirst(res));
1032 }
1033 
1034 /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
1035  */
1036 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1037  PacketBlock<Packet2d, 2> t0, t1, t2, t3;
1038  // copy top-left 2x2 Packet2d block
1039  t0.packet[0] = kernel.packet[0].v4f[0];
1040  t0.packet[1] = kernel.packet[1].v4f[0];
1041 
1042  // copy top-right 2x2 Packet2d block
1043  t1.packet[0] = kernel.packet[0].v4f[1];
1044  t1.packet[1] = kernel.packet[1].v4f[1];
1045 
1046  // copy bottom-left 2x2 Packet2d block
1047  t2.packet[0] = kernel.packet[2].v4f[0];
1048  t2.packet[1] = kernel.packet[3].v4f[0];
1049 
1050  // copy bottom-right 2x2 Packet2d block
1051  t3.packet[0] = kernel.packet[2].v4f[1];
1052  t3.packet[1] = kernel.packet[3].v4f[1];
1053 
1054  // Transpose all 2x2 blocks
1055  ptranspose(t0);
1056  ptranspose(t1);
1057  ptranspose(t2);
1058  ptranspose(t3);
1059 
1060  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
1061  kernel.packet[0].v4f[0] = t0.packet[0];
1062  kernel.packet[0].v4f[1] = t2.packet[0];
1063  kernel.packet[1].v4f[0] = t0.packet[1];
1064  kernel.packet[1].v4f[1] = t2.packet[1];
1065  kernel.packet[2].v4f[0] = t1.packet[0];
1066  kernel.packet[2].v4f[1] = t3.packet[0];
1067  kernel.packet[3].v4f[0] = t1.packet[1];
1068  kernel.packet[3].v4f[1] = t3.packet[1];
1069 }
1070 
1071 template <>
1072 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
1073  const Packet4f& elsePacket) {
1074  Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
1075  Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
1076  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
1077  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
1078  Packet4f result;
1079  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
1080  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
1081  return result;
1082 }
1083 
1084 template <>
1085 Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1086  Packet4f res;
1087  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
1088  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
1089  return res;
1090 }
1091 
1092 template <>
1093 Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1094  Packet4f res;
1095  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
1096  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
1097  return res;
1098 }
1099 
1100 template <>
1101 Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1102  Packet4f res;
1103  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
1104  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
1105  return res;
1106 }
1107 
1108 #else
1109 template <>
1110 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1111  EIGEN_DEBUG_ALIGNED_LOAD
1112  return vec_xl(0, from);
1113 }
1114 
1115 template <>
1116 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1117  EIGEN_DEBUG_ALIGNED_STORE
1118  vec_xst(from, 0, to);
1119 }
1120 
1121 template <>
1122 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
1123  return vec_splats(from);
1124 }
1125 
1126 template <>
1127 EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1128  a3 = pload<Packet4f>(a);
1129  a0 = vec_splat(a3, 0);
1130  a1 = vec_splat(a3, 1);
1131  a2 = vec_splat(a3, 2);
1132  a3 = vec_splat(a3, 3);
1133 }
1134 
1135 template <>
1136 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1137  EIGEN_ALIGN16 float af[4];
1138  af[0] = from[0 * stride];
1139  af[1] = from[1 * stride];
1140  af[2] = from[2 * stride];
1141  af[3] = from[3 * stride];
1142  return pload<Packet4f>(af);
1143 }
1144 
1145 template <>
1146 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1147  EIGEN_ALIGN16 float af[4];
1148  pstore<float>((float*)af, from);
1149  to[0 * stride] = af[0];
1150  to[1 * stride] = af[1];
1151  to[2 * stride] = af[2];
1152  to[3 * stride] = af[3];
1153 }
1154 
1155 template <>
1156 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1157  return (a + b);
1158 }
1159 template <>
1160 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1161  return (a - b);
1162 }
1163 template <>
1164 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1165  return (a * b);
1166 }
1167 template <>
1168 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1169  return (a / b);
1170 }
1171 template <>
1172 EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
1173  return (-a);
1174 }
1175 template <>
1176 EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
1177  return a;
1178 }
1179 template <>
1180 EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1181  return vec_madd(a, b, c);
1182 }
1183 template <>
1184 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1185  return vec_min(a, b);
1186 }
1187 template <>
1188 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1189  return vec_max(a, b);
1190 }
1191 template <>
1192 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1193  return vec_and(a, b);
1194 }
1195 template <>
1196 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1197  return vec_or(a, b);
1198 }
1199 template <>
1200 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1201  return vec_xor(a, b);
1202 }
1203 template <>
1204 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205  return vec_and(a, vec_nor(b, b));
1206 }
1207 template <>
1208 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1209  /* Uses non-default rounding for vec_round */
1210  return __builtin_s390_vfisb(a, 0, 1);
1211 }
1212 template <>
1213 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1214  return vec_ceil(a);
1215 }
1216 template <>
1217 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1218  return vec_floor(a);
1219 }
1220 template <>
1221 EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
1222  return vec_abs(a);
1223 }
1224 template <>
1225 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1226  EIGEN_ALIGN16 float x[4];
1227  pstore(x, a);
1228  return x[0];
1229 }
1230 
1231 template <>
1232 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1233  Packet4f p = pload<Packet4f>(from);
1234  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1235 }
1236 
1237 template <>
1238 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1239  return reinterpret_cast<Packet4f>(
1240  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1241 }
1242 
1243 template <>
1244 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1245  Packet4f b, sum;
1246  b = vec_sld(a, a, 8);
1247  sum = padd<Packet4f>(a, b);
1248  b = vec_sld(sum, sum, 4);
1249  sum = padd<Packet4f>(sum, b);
1250  return pfirst(sum);
1251 }
1252 
1253 // Other reduction functions:
1254 // mul
1255 template <>
1256 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1257  Packet4f prod;
1258  prod = pmul(a, vec_sld(a, a, 8));
1259  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1260 }
1261 
1262 // min
1263 template <>
1264 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
1265  Packet4f b, res;
1266  b = pmin<Packet4f>(a, vec_sld(a, a, 8));
1267  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
1268  return pfirst(res);
1269 }
1270 
1271 // max
1272 template <>
1273 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
1274  Packet4f b, res;
1275  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
1276  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
1277  return pfirst(res);
1278 }
1279 
1280 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1281  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1282  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1283  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1284  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1285  kernel.packet[0] = vec_mergeh(t0, t2);
1286  kernel.packet[1] = vec_mergel(t0, t2);
1287  kernel.packet[2] = vec_mergeh(t1, t3);
1288  kernel.packet[3] = vec_mergel(t1, t3);
1289 }
1290 
1291 template <>
1292 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
1293  const Packet4f& elsePacket) {
1294  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
1295  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
1296  return vec_sel(elsePacket, thenPacket, mask);
1297 }
1298 
1299 #endif
1300 
1301 template <>
1302 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1303  return pldexp_generic(a, exponent);
1304 }
1305 
1306 template <>
1307 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1308  // Clamp exponent to [-2099, 2099]
1309  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1310  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1311 
1312  // Split 2^e into four factors and multiply:
1313  const Packet2l bias = {1023, 1023};
1314  Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
1315  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
1316  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1317  b = psub(psub(psub(e, b), b), b); // e - 3b
1318  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
1319  out = pmul(out, c); // a * 2^e
1320  return out;
1321 }
1322 
1323 template <>
1324 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1325  EIGEN_ZVECTOR_PREFETCH(addr);
1326 }
1327 template <>
1328 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1329  return pload<Packet4f>(from);
1330 }
1331 template <>
1332 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1333  pstore<float>(to, from);
1334 }
1335 template <>
1336 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
1337  return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
1338 }
1339 
1340 #if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
1341 #pragma GCC warning \
1342  "float->int and int->float conversion is simulated. compile for z15 for improved performance"
1343 template <>
1344 struct cast_impl<Packet4i, Packet4f> {
1345  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
1346  return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
1347  }
1348 };
1349 
1350 template <>
1351 struct cast_impl<Packet4f, Packet4i> {
1352  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
1353  return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
1354  }
1355 };
1356 
1357 template <>
1358 struct cast_impl<Packet2l, Packet2d> {
1359  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
1360  return Packet2d{double(a[0]), double(a[1]) };
1361  }
1362 };
1363 
1364 template <>
1365 struct cast_impl<Packet2d, Packet2l> {
1366  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
1367  return Packet2l{(long long)(a[0]), (long long)(a[1]) };
1368  }
1369 };
1370 #else
1371 template <>
1372 struct cast_impl<Packet4i, Packet4f> {
1373  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
1374  return vec_float(a);
1375  }
1376 };
1377 
1378 template <>
1379 struct cast_impl<Packet4f, Packet4i> {
1380  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
1381  return vec_signed(a);
1382  }
1383 };
1384 
1385 template <>
1386 struct cast_impl<Packet2l, Packet2d> {
1387  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
1388  return vec_double(a);
1389  }
1390 };
1391 
1392 template <>
1393 struct cast_impl<Packet2d, Packet2l> {
1394  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
1395  return vec_signed(a);
1396  }
1397 };
1398 #endif
1399 
1400 template <>
1401 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
1402  return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
1403 }
1404 template <>
1405 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
1406  return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
1407 }
1408 
1409 } // end namespace internal
1410 
1411 } // end namespace Eigen
1412 
1413 #endif // EIGEN_PACKET_MATH_ZVECTOR_H
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82