$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11 #define EIGEN_PACKET_MATH_ALTIVEC_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
22 #endif
23 
24 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
26 #endif
27 
28 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
29 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
30 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
31 #endif
32 
33 typedef __vector float Packet4f;
34 typedef __vector int Packet4i;
35 typedef __vector unsigned int Packet4ui;
36 typedef __vector __bool int Packet4bi;
37 typedef __vector short int Packet8s;
38 typedef __vector unsigned short int Packet8us;
39 typedef __vector __bool short Packet8bi;
40 typedef __vector signed char Packet16c;
41 typedef __vector unsigned char Packet16uc;
42 typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
43 
44 // We don't want to write the same code all the time, but we need to reuse the constants
45 // and it doesn't really work to declare them global, so we define macros instead
46 #define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
47 
48 #define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
49 
50 #define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
51 
52 #define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53 
54 #define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
55  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56 
57 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
58 
59 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
60 
61 #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
62 
63 #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
64 
65 #define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
66  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
67 
68 #define DST_CHAN 1
69 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
70 #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
71 
72 // These constants are endian-agnostic
73 static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
74 static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
75 static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
76 static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
77 static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
78 static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
79 static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
80 static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
81 static Packet4f p4f_MZERO =
82  (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
83 #ifndef __VSX__
84 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
85 #endif
86 
87 static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
88 static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
89 static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
90 static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
91 
92 static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
93 static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
94 
95 static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
96 static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
97 static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
98 
99 #ifdef _BIG_ENDIAN
100 static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
101 #endif
102 static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
103 static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
104 
105 static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
106 static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
107 
108 static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
109 static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
110 #ifdef _BIG_ENDIAN
111 static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
112 #else
113 static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
114 #endif
115 
116 // Handle endianness properly while loading constants
117 // Define global static constants:
118 #ifdef _BIG_ENDIAN
119 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
120 static Packet16uc p16uc_PSET32_WODD =
121  vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
122  8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
123 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
124  8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
125 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
126  8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
127 #else
128 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
129 static Packet16uc p16uc_PSET32_WODD =
130  vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
131  8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
132 static Packet16uc p16uc_PSET32_WEVEN =
133  vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
134  8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
135 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
136  8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
137 #endif // _BIG_ENDIAN
138 
139 static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
140  (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
141 static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
142  (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
143 static Packet16uc p16uc_TRANSPOSE64_HI =
144  p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
145 static Packet16uc p16uc_TRANSPOSE64_LO =
146  p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
147 
148 static Packet16uc p16uc_COMPLEX32_REV =
149  vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
150 
151 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
153 #else
154 #define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
155 #endif
156 
157 #if EIGEN_COMP_LLVM
158 #define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
159 #else
160 #define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
161 #endif
162 
163 template <>
164 struct packet_traits<float> : default_packet_traits {
165  typedef Packet4f type;
166  typedef Packet4f half;
167  enum {
168  Vectorizable = 1,
169  AlignedOnScalar = 1,
170  size = 4,
171 
172  HasAdd = 1,
173  HasSub = 1,
174  HasMul = 1,
175  HasDiv = 1,
176  HasMin = 1,
177  HasMax = 1,
178  HasAbs = 1,
179  HasSin = EIGEN_FAST_MATH,
180  HasCos = EIGEN_FAST_MATH,
181  HasACos = 1,
182  HasASin = 1,
183  HasATan = 1,
184  HasATanh = 1,
185  HasLog = 1,
186  HasExp = 1,
187 #ifdef EIGEN_VECTORIZE_VSX
188  HasCmp = 1,
189  HasPow = 1,
190  HasSqrt = 1,
191  HasCbrt = 1,
192 #if !EIGEN_COMP_CLANG
193  HasRsqrt = 1,
194 #else
195  HasRsqrt = 0,
196 #endif
197  HasTanh = EIGEN_FAST_MATH,
198  HasErf = EIGEN_FAST_MATH,
199  HasErfc = EIGEN_FAST_MATH,
200 #else
201  HasSqrt = 0,
202  HasRsqrt = 0,
203  HasTanh = 0,
204  HasErf = 0,
205 #endif
206  HasNegate = 1,
207  HasBlend = 1
208  };
209 };
210 template <>
211 struct packet_traits<bfloat16> : default_packet_traits {
212  typedef Packet8bf type;
213  typedef Packet8bf half;
214  enum {
215  Vectorizable = 1,
216  AlignedOnScalar = 1,
217  size = 8,
218 
219  HasAdd = 1,
220  HasSub = 1,
221  HasMul = 1,
222  HasDiv = 1,
223  HasMin = 1,
224  HasMax = 1,
225  HasAbs = 1,
226  HasSin = EIGEN_FAST_MATH,
227  HasCos = EIGEN_FAST_MATH,
228  HasLog = 1,
229  HasExp = 1,
230 #ifdef EIGEN_VECTORIZE_VSX
231  HasSqrt = 1,
232 #if !EIGEN_COMP_CLANG
233  HasRsqrt = 1,
234 #else
235  HasRsqrt = 0,
236 #endif
237 #else
238  HasSqrt = 0,
239  HasRsqrt = 0,
240 #endif
241  HasTanh = 0,
242  HasErf = 0,
243  HasNegate = 1,
244  HasBlend = 1
245  };
246 };
247 
248 template <>
249 struct packet_traits<int> : default_packet_traits {
250  typedef Packet4i type;
251  typedef Packet4i half;
252  enum {
253  Vectorizable = 1,
254  AlignedOnScalar = 1,
255  size = 4,
256 
257  HasAdd = 1,
258  HasSub = 1,
259  HasShift = 1,
260  HasMul = 1,
261 #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
262  HasDiv = 1,
263 #else
264  HasDiv = 0,
265 #endif
266  HasBlend = 1,
267  HasCmp = 1
268  };
269 };
270 
271 template <>
272 struct packet_traits<short int> : default_packet_traits {
273  typedef Packet8s type;
274  typedef Packet8s half;
275  enum {
276  Vectorizable = 1,
277  AlignedOnScalar = 1,
278  size = 8,
279 
280  HasAdd = 1,
281  HasSub = 1,
282  HasMul = 1,
283  HasDiv = 0,
284  HasBlend = 1,
285  HasCmp = 1
286  };
287 };
288 
289 template <>
290 struct packet_traits<unsigned short int> : default_packet_traits {
291  typedef Packet8us type;
292  typedef Packet8us half;
293  enum {
294  Vectorizable = 1,
295  AlignedOnScalar = 1,
296  size = 8,
297 
298  HasAdd = 1,
299  HasSub = 1,
300  HasMul = 1,
301  HasDiv = 0,
302  HasBlend = 1,
303  HasCmp = 1
304  };
305 };
306 
307 template <>
308 struct packet_traits<signed char> : default_packet_traits {
309  typedef Packet16c type;
310  typedef Packet16c half;
311  enum {
312  Vectorizable = 1,
313  AlignedOnScalar = 1,
314  size = 16,
315 
316  HasAdd = 1,
317  HasSub = 1,
318  HasMul = 1,
319  HasDiv = 0,
320  HasBlend = 1,
321  HasCmp = 1
322  };
323 };
324 
325 template <>
326 struct packet_traits<unsigned char> : default_packet_traits {
327  typedef Packet16uc type;
328  typedef Packet16uc half;
329  enum {
330  Vectorizable = 1,
331  AlignedOnScalar = 1,
332  size = 16,
333 
334  HasAdd = 1,
335  HasSub = 1,
336  HasMul = 1,
337  HasDiv = 0,
338  HasBlend = 1,
339  HasCmp = 1
340  };
341 };
342 
343 template <>
344 struct unpacket_traits<Packet4f> {
345  typedef float type;
346  typedef Packet4f half;
347  typedef Packet4i integer_packet;
348  enum {
349  size = 4,
350  alignment = Aligned16,
351  vectorizable = true,
352  masked_load_available = false,
353  masked_store_available = false
354  };
355 };
356 template <>
357 struct unpacket_traits<Packet4i> {
358  typedef int type;
359  typedef Packet4i half;
360  enum {
361  size = 4,
362  alignment = Aligned16,
363  vectorizable = true,
364  masked_load_available = false,
365  masked_store_available = false
366  };
367 };
368 template <>
369 struct unpacket_traits<Packet8s> {
370  typedef short int type;
371  typedef Packet8s half;
372  enum {
373  size = 8,
374  alignment = Aligned16,
375  vectorizable = true,
376  masked_load_available = false,
377  masked_store_available = false
378  };
379 };
380 template <>
381 struct unpacket_traits<Packet8us> {
382  typedef unsigned short int type;
383  typedef Packet8us half;
384  enum {
385  size = 8,
386  alignment = Aligned16,
387  vectorizable = true,
388  masked_load_available = false,
389  masked_store_available = false
390  };
391 };
392 
393 template <>
394 struct unpacket_traits<Packet16c> {
395  typedef signed char type;
396  typedef Packet16c half;
397  enum {
398  size = 16,
399  alignment = Aligned16,
400  vectorizable = true,
401  masked_load_available = false,
402  masked_store_available = false
403  };
404 };
405 template <>
406 struct unpacket_traits<Packet16uc> {
407  typedef unsigned char type;
408  typedef Packet16uc half;
409  enum {
410  size = 16,
411  alignment = Aligned16,
412  vectorizable = true,
413  masked_load_available = false,
414  masked_store_available = false
415  };
416 };
417 
418 template <>
419 struct unpacket_traits<Packet8bf> {
420  typedef bfloat16 type;
421  typedef Packet8bf half;
422  enum {
423  size = 8,
424  alignment = Aligned16,
425  vectorizable = true,
426  masked_load_available = false,
427  masked_store_available = false
428  };
429 };
430 
431 template <typename Packet>
432 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
433  // some versions of GCC throw "unused-but-set-parameter".
434  // ignoring these warnings for now.
435  EIGEN_UNUSED_VARIABLE(from);
436  EIGEN_DEBUG_ALIGNED_LOAD
437 #ifdef EIGEN_VECTORIZE_VSX
438  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
439 #else
440  return vec_ld(0, from);
441 #endif
442 }
443 
444 // Need to define them first or we get specialization after instantiation errors
445 template <>
446 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
447  return pload_common<Packet4f>(from);
448 }
449 
450 template <>
451 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
452  return pload_common<Packet4i>(from);
453 }
454 
455 template <>
456 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
457  return pload_common<Packet8s>(from);
458 }
459 
460 template <>
461 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
462  return pload_common<Packet8us>(from);
463 }
464 
465 template <>
466 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
467  return pload_common<Packet16c>(from);
468 }
469 
470 template <>
471 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
472  return pload_common<Packet16uc>(from);
473 }
474 
475 template <>
476 EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
477  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
478 }
479 
480 template <typename Packet>
481 EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
482  // some versions of GCC throw "unused-but-set-parameter".
483  // ignoring these warnings for now.
484  EIGEN_UNUSED_VARIABLE(from);
485  EIGEN_DEBUG_ALIGNED_LOAD
486  // Ignore partial input memory initialized
487 #if !EIGEN_COMP_LLVM
488 #pragma GCC diagnostic push
489 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
490 #endif
491 #ifdef EIGEN_VECTORIZE_VSX
492  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
493 #else
494  return vec_ld(0, from);
495 #endif
496 #if !EIGEN_COMP_LLVM
497 #pragma GCC diagnostic pop
498 #endif
499 }
500 
501 template <>
502 EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
503  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
504 }
505 
506 template <typename Packet>
507 EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
508  const Index offset) {
509  // some versions of GCC throw "unused-but-set-parameter".
510  // ignoring these warnings for now.
511  const Index packet_size = unpacket_traits<Packet>::size;
512  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
513  const Index size = sizeof(__UNPACK_TYPE__(Packet));
514 #ifdef _ARCH_PWR9
515  EIGEN_UNUSED_VARIABLE(packet_size);
516  EIGEN_DEBUG_ALIGNED_LOAD
517  EIGEN_UNUSED_VARIABLE(from);
518  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
519  if (offset) {
520  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
521 #ifdef _BIG_ENDIAN
522  load = Packet(vec_sro(Packet16uc(load), shift));
523 #else
524  load = Packet(vec_slo(Packet16uc(load), shift));
525 #endif
526  }
527  return load;
528 #else
529  if (n) {
530  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
531  unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
532  unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
533  Index n2 = n * size;
534  if (16 <= n2) {
535  pstoreu(load2, ploadu<Packet16uc>(from2));
536  } else {
537  memcpy((void*)load2, (void*)from2, n2);
538  }
539  return pload_ignore<Packet>(load);
540  } else {
541  return Packet(pset1<Packet16uc>(0));
542  }
543 #endif
544 }
545 
546 template <>
547 EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
548  return pload_partial_common<Packet4f>(from, n, offset);
549 }
550 
551 template <>
552 EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
553  return pload_partial_common<Packet4i>(from, n, offset);
554 }
555 
556 template <>
557 EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
558  return pload_partial_common<Packet8s>(from, n, offset);
559 }
560 
561 template <>
562 EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
563  const Index offset) {
564  return pload_partial_common<Packet8us>(from, n, offset);
565 }
566 
567 template <>
568 EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
569  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
570 }
571 
572 template <>
573 EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
574  return pload_partial_common<Packet16c>(from, n, offset);
575 }
576 
577 template <>
578 EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
579  return pload_partial_common<Packet16uc>(from, n, offset);
580 }
581 
582 template <typename Packet>
583 EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
584  // some versions of GCC throw "unused-but-set-parameter" (float *to).
585  // ignoring these warnings for now.
586  EIGEN_UNUSED_VARIABLE(to);
587  EIGEN_DEBUG_ALIGNED_STORE
588 #ifdef EIGEN_VECTORIZE_VSX
589  vec_xst(from, 0, to);
590 #else
591  vec_st(from, 0, to);
592 #endif
593 }
594 
595 template <>
596 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
597  pstore_common<Packet4f>(to, from);
598 }
599 
600 template <>
601 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
602  pstore_common<Packet4i>(to, from);
603 }
604 
605 template <>
606 EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
607  pstore_common<Packet8s>(to, from);
608 }
609 
610 template <>
611 EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
612  pstore_common<Packet8us>(to, from);
613 }
614 
615 template <>
616 EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
617  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
618 }
619 
620 template <>
621 EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
622  pstore_common<Packet16c>(to, from);
623 }
624 
625 template <>
626 EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
627  pstore_common<Packet16uc>(to, from);
628 }
629 
630 template <typename Packet>
631 EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
632  const Index offset) {
633  // some versions of GCC throw "unused-but-set-parameter" (float *to).
634  // ignoring these warnings for now.
635  const Index packet_size = unpacket_traits<Packet>::size;
636  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
637  const Index size = sizeof(__UNPACK_TYPE__(Packet));
638 #ifdef _ARCH_PWR9
639  EIGEN_UNUSED_VARIABLE(packet_size);
640  EIGEN_UNUSED_VARIABLE(to);
641  EIGEN_DEBUG_ALIGNED_STORE
642  Packet store = from;
643  if (offset) {
644  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
645 #ifdef _BIG_ENDIAN
646  store = Packet(vec_slo(Packet16uc(store), shift));
647 #else
648  store = Packet(vec_sro(Packet16uc(store), shift));
649 #endif
650  }
651  vec_xst_len(store, to, n * size);
652 #else
653  if (n) {
654  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
655  pstore(store, from);
656  unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
657  unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
658  Index n2 = n * size;
659  if (16 <= n2) {
660  pstore(to2, ploadu<Packet16uc>(store2));
661  } else {
662  memcpy((void*)to2, (void*)store2, n2);
663  }
664  }
665 #endif
666 }
667 
668 template <>
669 EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
670  pstore_partial_common<Packet4f>(to, from, n, offset);
671 }
672 
673 template <>
674 EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
675  pstore_partial_common<Packet4i>(to, from, n, offset);
676 }
677 
678 template <>
679 EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
680  const Index offset) {
681  pstore_partial_common<Packet8s>(to, from, n, offset);
682 }
683 
684 template <>
685 EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
686  const Index n, const Index offset) {
687  pstore_partial_common<Packet8us>(to, from, n, offset);
688 }
689 
690 template <>
691 EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
692  const Index offset) {
693  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
694 }
695 
696 template <>
697 EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
698  const Index offset) {
699  pstore_partial_common<Packet16c>(to, from, n, offset);
700 }
701 
702 template <>
703 EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
704  const Index offset) {
705  pstore_partial_common<Packet16uc>(to, from, n, offset);
706 }
707 
708 template <typename Packet>
709 EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
710  Packet v = {from, from, from, from};
711  return v;
712 }
713 
714 template <typename Packet>
715 EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
716  Packet v = {from, from, from, from, from, from, from, from};
717  return v;
718 }
719 
720 template <typename Packet>
721 EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
722  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
723  return v;
724 }
725 
726 template <>
727 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
728  return pset1_size4<Packet4f>(from);
729 }
730 
731 template <>
732 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
733  return pset1_size4<Packet4i>(from);
734 }
735 
736 template <>
737 EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
738  return pset1_size8<Packet8s>(from);
739 }
740 
741 template <>
742 EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
743  return pset1_size8<Packet8us>(from);
744 }
745 
746 template <>
747 EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
748  return pset1_size16<Packet16c>(from);
749 }
750 
751 template <>
752 EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
753  return pset1_size16<Packet16uc>(from);
754 }
755 
756 template <>
757 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
758  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
759 }
760 
761 template <>
762 EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
763  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
764 }
765 
766 template <typename Packet>
767 EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
768  Packet& a3) {
769  a3 = pload<Packet>(a);
770  a0 = vec_splat(a3, 0);
771  a1 = vec_splat(a3, 1);
772  a2 = vec_splat(a3, 2);
773  a3 = vec_splat(a3, 3);
774 }
775 
776 template <>
777 EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
778  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
779 }
780 template <>
781 EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
782  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
783 }
784 
785 template <typename Packet>
786 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
787  const Index n = unpacket_traits<Packet>::size) {
788  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
789  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
790  if (stride == 1) {
791  if (n == unpacket_traits<Packet>::size) {
792  return ploadu<Packet>(from);
793  } else {
794  return ploadu_partial<Packet>(from, n);
795  }
796  } else {
797  LOAD_STORE_UNROLL_16
798  for (Index i = 0; i < n; i++) {
799  a[i] = from[i * stride];
800  }
801  // Leave rest of the array uninitialized
802  return pload_ignore<Packet>(a);
803  }
804 }
805 
806 template <>
807 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
808  return pgather_common<Packet4f>(from, stride);
809 }
810 
811 template <>
812 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
813  return pgather_common<Packet4i>(from, stride);
814 }
815 
816 template <>
817 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
818  return pgather_common<Packet8s>(from, stride);
819 }
820 
821 template <>
822 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
823  Index stride) {
824  return pgather_common<Packet8us>(from, stride);
825 }
826 
827 template <>
828 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
829  return pgather_common<Packet8bf>(from, stride);
830 }
831 
832 template <>
833 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
834  return pgather_common<Packet16c>(from, stride);
835 }
836 
837 template <>
838 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
839  Index stride) {
840  return pgather_common<Packet16uc>(from, stride);
841 }
842 
843 template <>
844 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
845  const Index n) {
846  return pgather_common<Packet4f>(from, stride, n);
847 }
848 
849 template <>
850 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
851  const Index n) {
852  return pgather_common<Packet4i>(from, stride, n);
853 }
854 
855 template <>
856 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
857  const Index n) {
858  return pgather_common<Packet8s>(from, stride, n);
859 }
860 
861 template <>
862 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
863 pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
864  return pgather_common<Packet8us>(from, stride, n);
865 }
866 
867 template <>
868 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
869  const Index n) {
870  return pgather_common<Packet8bf>(from, stride, n);
871 }
872 
873 template <>
874 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
875  Index stride, const Index n) {
876  return pgather_common<Packet16c>(from, stride, n);
877 }
878 
879 template <>
880 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
881  Index stride,
882  const Index n) {
883  return pgather_common<Packet16uc>(from, stride, n);
884 }
885 
886 template <typename Packet>
887 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
888  Index stride,
889  const Index n = unpacket_traits<Packet>::size) {
890  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
891  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
892  if (stride == 1) {
893  if (n == unpacket_traits<Packet>::size) {
894  return pstoreu(to, from);
895  } else {
896  return pstoreu_partial(to, from, n);
897  }
898  } else {
899  pstore<__UNPACK_TYPE__(Packet)>(a, from);
900  LOAD_STORE_UNROLL_16
901  for (Index i = 0; i < n; i++) {
902  to[i * stride] = a[i];
903  }
904  }
905 }
906 
907 template <>
908 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
909  pscatter_common<Packet4f>(to, from, stride);
910 }
911 
912 template <>
913 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
914  pscatter_common<Packet4i>(to, from, stride);
915 }
916 
917 template <>
918 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
919  Index stride) {
920  pscatter_common<Packet8s>(to, from, stride);
921 }
922 
923 template <>
924 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
925  const Packet8us& from,
926  Index stride) {
927  pscatter_common<Packet8us>(to, from, stride);
928 }
929 
930 template <>
931 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
932  Index stride) {
933  pscatter_common<Packet8bf>(to, from, stride);
934 }
935 
936 template <>
937 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
938  Index stride) {
939  pscatter_common<Packet16c>(to, from, stride);
940 }
941 
942 template <>
943 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
944  const Packet16uc& from, Index stride) {
945  pscatter_common<Packet16uc>(to, from, stride);
946 }
947 
948 template <>
949 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
950  Index stride, const Index n) {
951  pscatter_common<Packet4f>(to, from, stride, n);
952 }
953 
954 template <>
955 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
956  const Index n) {
957  pscatter_common<Packet4i>(to, from, stride, n);
958 }
959 
960 template <>
961 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
962  Index stride, const Index n) {
963  pscatter_common<Packet8s>(to, from, stride, n);
964 }
965 
966 template <>
967 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
968  const Packet8us& from,
969  Index stride,
970  const Index n) {
971  pscatter_common<Packet8us>(to, from, stride, n);
972 }
973 
974 template <>
975 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
976  Index stride, const Index n) {
977  pscatter_common<Packet8bf>(to, from, stride, n);
978 }
979 
980 template <>
981 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
982  const Packet16c& from, Index stride,
983  const Index n) {
984  pscatter_common<Packet16c>(to, from, stride, n);
985 }
986 
987 template <>
988 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
989  const Packet16uc& from,
990  Index stride, const Index n) {
991  pscatter_common<Packet16uc>(to, from, stride, n);
992 }
993 
994 template <>
995 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
996  return pset1<Packet4f>(a) + p4f_COUNTDOWN;
997 }
998 template <>
999 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
1000  return pset1<Packet4i>(a) + p4i_COUNTDOWN;
1001 }
1002 template <>
1003 EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
1004  return pset1<Packet8s>(a) + p8s_COUNTDOWN;
1005 }
1006 template <>
1007 EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
1008  return pset1<Packet8us>(a) + p8us_COUNTDOWN;
1009 }
1010 template <>
1011 EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
1012  return pset1<Packet16c>(a) + p16c_COUNTDOWN;
1013 }
1014 template <>
1015 EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
1016  return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
1017 }
1018 
1019 template <>
1020 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1021  return a + b;
1022 }
1023 template <>
1024 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
1025  return a + b;
1026 }
1027 template <>
1028 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1029  return a + b;
1030 }
1031 template <>
1032 EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
1033  return a + b;
1034 }
1035 template <>
1036 EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
1037  return a + b;
1038 }
1039 template <>
1040 EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
1041  return a + b;
1042 }
1043 template <>
1044 EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1045  return a + b;
1046 }
1047 
1048 template <>
1049 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1050  return a - b;
1051 }
1052 template <>
1053 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
1054  return a - b;
1055 }
1056 template <>
1057 EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
1058  return a - b;
1059 }
1060 template <>
1061 EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
1062  return a - b;
1063 }
1064 template <>
1065 EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
1066  return a - b;
1067 }
1068 template <>
1069 EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1070  return a - b;
1071 }
1072 
1073 template <>
1074 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
1075 #ifdef __POWER8_VECTOR__
1076  return vec_neg(a);
1077 #else
1078  return vec_xor(a, p4f_MZERO);
1079 #endif
1080 }
1081 template <>
1082 EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
1083 #ifdef __POWER8_VECTOR__
1084  return vec_neg(a);
1085 #else
1086  return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
1087 #endif
1088 }
1089 template <>
1090 EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
1091 #ifdef __POWER8_VECTOR__
1092  return vec_neg(a);
1093 #else
1094  return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
1095 #endif
1096 }
1097 template <>
1098 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
1099 #ifdef __POWER8_VECTOR__
1100  return vec_neg(a);
1101 #else
1102  return p4i_ZERO - a;
1103 #endif
1104 }
1105 
1106 template <>
1107 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
1108  return a;
1109 }
1110 template <>
1111 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
1112  return a;
1113 }
1114 
1115 template <>
1116 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1117  return vec_madd(a, b, p4f_MZERO);
1118 }
1119 template <>
1120 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1121  return a * b;
1122 }
1123 template <>
1124 EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1125  return vec_mul(a, b);
1126 }
1127 template <>
1128 EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1129  return vec_mul(a, b);
1130 }
1131 template <>
1132 EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
1133  return vec_mul(a, b);
1134 }
1135 template <>
1136 EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1137  return vec_mul(a, b);
1138 }
1139 
1140 template <>
1141 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1142 #ifndef __VSX__ // VSX actually provides a div instruction
1143  Packet4f t, y_0, y_1;
1144 
1145  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
1146  y_0 = vec_re(b);
1147 
1148  // Do one Newton-Raphson iteration to get the needed accuracy
1149  t = vec_nmsub(y_0, b, p4f_ONE);
1150  y_1 = vec_madd(y_0, t, y_0);
1151 
1152  return vec_madd(a, y_1, p4f_MZERO);
1153 #else
1154  return vec_div(a, b);
1155 #endif
1156 }
1157 
1158 template <>
1159 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
1160 #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
1161  return vec_div(a, b);
1162 #else
1163  EIGEN_UNUSED_VARIABLE(a);
1164  EIGEN_UNUSED_VARIABLE(b);
1165  eigen_assert(false && "packet integer division are not supported by AltiVec");
1166  return pset1<Packet4i>(0);
1167 #endif
1168 }
1169 
1170 // for some weird raisons, it has to be overloaded for packet of integers
1171 template <>
1172 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1173  return vec_madd(a, b, c);
1174 }
1175 template <>
1176 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1177  return a * b + c;
1178 }
1179 template <>
1180 EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1181  return vec_madd(a, b, c);
1182 }
1183 template <>
1184 EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1185  return vec_madd(a, b, c);
1186 }
1187 
1188 #ifdef EIGEN_VECTORIZE_VSX
1189 template <>
1190 EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1191  return vec_msub(a, b, c);
1192 }
1193 template <>
1194 EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1195  return vec_nmsub(a, b, c);
1196 }
1197 template <>
1198 EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1199  return vec_nmadd(a, b, c);
1200 }
1201 #endif
1202 
1203 template <>
1204 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205 #ifdef EIGEN_VECTORIZE_VSX
1206  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
1207  Packet4f ret;
1208  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1209  return ret;
1210 #else
1211  return vec_min(a, b);
1212 #endif
1213 }
1214 template <>
1215 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1216  return vec_min(a, b);
1217 }
1218 template <>
1219 EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1220  return vec_min(a, b);
1221 }
1222 template <>
1223 EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1224  return vec_min(a, b);
1225 }
1226 template <>
1227 EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1228  return vec_min(a, b);
1229 }
1230 template <>
1231 EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1232  return vec_min(a, b);
1233 }
1234 
1235 template <>
1236 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1237 #ifdef EIGEN_VECTORIZE_VSX
1238  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
1239  Packet4f ret;
1240  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1241  return ret;
1242 #else
1243  return vec_max(a, b);
1244 #endif
1245 }
1246 template <>
1247 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1248  return vec_max(a, b);
1249 }
1250 template <>
1251 EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1252  return vec_max(a, b);
1253 }
1254 template <>
1255 EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1256  return vec_max(a, b);
1257 }
1258 template <>
1259 EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1260  return vec_max(a, b);
1261 }
1262 template <>
1263 EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1264  return vec_max(a, b);
1265 }
1266 
1267 template <>
1268 EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
1269  return reinterpret_cast<Packet4f>(vec_cmple(a, b));
1270 }
1271 // To fix bug with vec_cmplt on older versions
1272 #ifdef EIGEN_VECTORIZE_VSX
1273 template <>
1274 EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
1275  return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
1276 }
1277 #endif
1278 template <>
1279 EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
1280  return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
1281 }
1282 template <>
1283 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
1284  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
1285  return vec_nor(c, c);
1286 }
1287 
1288 #ifdef EIGEN_VECTORIZE_VSX
1289 template <>
1290 EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
1291  return reinterpret_cast<Packet4i>(vec_cmple(a, b));
1292 }
1293 #endif
1294 template <>
1295 EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
1296  return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
1297 }
1298 template <>
1299 EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
1300  return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
1301 }
1302 #ifdef EIGEN_VECTORIZE_VSX
1303 template <>
1304 EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
1305  return reinterpret_cast<Packet8s>(vec_cmple(a, b));
1306 }
1307 #endif
1308 template <>
1309 EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
1310  return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
1311 }
1312 template <>
1313 EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
1314  return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
1315 }
1316 #ifdef EIGEN_VECTORIZE_VSX
1317 template <>
1318 EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
1319  return reinterpret_cast<Packet8us>(vec_cmple(a, b));
1320 }
1321 #endif
1322 template <>
1323 EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
1324  return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
1325 }
1326 template <>
1327 EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
1328  return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
1329 }
1330 #ifdef EIGEN_VECTORIZE_VSX
1331 template <>
1332 EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
1333  return reinterpret_cast<Packet16c>(vec_cmple(a, b));
1334 }
1335 #endif
1336 template <>
1337 EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
1338  return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
1339 }
1340 template <>
1341 EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
1342  return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
1343 }
1344 #ifdef EIGEN_VECTORIZE_VSX
1345 template <>
1346 EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
1347  return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
1348 }
1349 #endif
1350 template <>
1351 EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
1352  return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
1353 }
1354 template <>
1355 EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
1356  return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
1357 }
1358 
1359 template <>
1360 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1361  return vec_and(a, b);
1362 }
1363 template <>
1364 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1365  return vec_and(a, b);
1366 }
1367 template <>
1368 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1369  return vec_and(a, b);
1370 }
1371 template <>
1372 EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1373  return vec_and(a, b);
1374 }
1375 template <>
1376 EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1377  return pand<Packet8us>(a, b);
1378 }
1379 
1380 template <>
1381 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1382  return vec_or(a, b);
1383 }
1384 template <>
1385 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1386  return vec_or(a, b);
1387 }
1388 template <>
1389 EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1390  return vec_or(a, b);
1391 }
1392 template <>
1393 EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1394  return vec_or(a, b);
1395 }
1396 template <>
1397 EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1398  return por<Packet8us>(a, b);
1399 }
1400 
1401 template <>
1402 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1403  return vec_xor(a, b);
1404 }
1405 template <>
1406 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1407  return vec_xor(a, b);
1408 }
1409 template <>
1410 EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1411  return vec_xor(a, b);
1412 }
1413 template <>
1414 EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1415  return pxor<Packet8us>(a, b);
1416 }
1417 
1418 template <>
1419 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1420  return vec_andc(a, b);
1421 }
1422 template <>
1423 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1424  return vec_andc(a, b);
1425 }
1426 
1427 template <>
1428 EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
1429  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
1430 }
1431 
1432 template <>
1433 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1434  Packet4f t = vec_add(
1435  reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1436  Packet4f res;
1437 
1438 #ifdef EIGEN_VECTORIZE_VSX
1439  __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
1440 #else
1441  __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
1442 #endif
1443 
1444  return res;
1445 }
1446 template <>
1447 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1448  return vec_ceil(a);
1449 }
1450 template <>
1451 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1452  return vec_floor(a);
1453 }
1454 template <>
1455 EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1456  return vec_trunc(a);
1457 }
1458 #ifdef EIGEN_VECTORIZE_VSX
1459 template <>
1460 EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1461  Packet4f res;
1462 
1463  __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
1464 
1465  return res;
1466 }
1467 #endif
1468 
1469 template <typename Packet>
1470 EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
1471  EIGEN_DEBUG_UNALIGNED_LOAD
1472 #if defined(EIGEN_VECTORIZE_VSX)
1473  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1474 #else
1475  Packet16uc MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
1476  Packet16uc LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
1477  Packet16uc mask = vec_lvsl(0, from); // create the permute mask
1478  // TODO: Add static_cast here
1479  return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
1480 #endif
1481 }
1482 
1483 template <>
1484 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1485  return ploadu_common<Packet4f>(from);
1486 }
1487 template <>
1488 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1489  return ploadu_common<Packet4i>(from);
1490 }
1491 template <>
1492 EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
1493  return ploadu_common<Packet8s>(from);
1494 }
1495 template <>
1496 EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
1497  return ploadu_common<Packet8us>(from);
1498 }
1499 template <>
1500 EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
1501  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1502 }
1503 template <>
1504 EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
1505  return ploadu_common<Packet16c>(from);
1506 }
1507 template <>
1508 EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
1509  return ploadu_common<Packet16uc>(from);
1510 }
1511 
1512 template <typename Packet>
1513 EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
1514  const Index offset) {
1515  const Index packet_size = unpacket_traits<Packet>::size;
1516  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
1517  const Index size = sizeof(__UNPACK_TYPE__(Packet));
1518 #ifdef _ARCH_PWR9
1519  EIGEN_UNUSED_VARIABLE(packet_size);
1520  EIGEN_DEBUG_ALIGNED_LOAD
1521  EIGEN_DEBUG_UNALIGNED_LOAD
1522  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
1523  if (offset) {
1524  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1525 #ifdef _BIG_ENDIAN
1526  load = Packet(vec_sro(Packet16uc(load), shift));
1527 #else
1528  load = Packet(vec_slo(Packet16uc(load), shift));
1529 #endif
1530  }
1531  return load;
1532 #else
1533  if (n) {
1534  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1535  unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
1536  unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
1537  Index n2 = n * size;
1538  if (16 <= n2) {
1539  pstoreu(load2, ploadu<Packet16uc>(from2));
1540  } else {
1541  memcpy((void*)load2, (void*)from2, n2);
1542  }
1543  return pload_ignore<Packet>(load);
1544  } else {
1545  return Packet(pset1<Packet16uc>(0));
1546  }
1547 #endif
1548 }
1549 
1550 template <>
1551 EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
1552  return ploadu_partial_common<Packet4f>(from, n, offset);
1553 }
1554 template <>
1555 EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
1556  return ploadu_partial_common<Packet4i>(from, n, offset);
1557 }
1558 template <>
1559 EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
1560  return ploadu_partial_common<Packet8s>(from, n, offset);
1561 }
1562 template <>
1563 EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
1564  const Index offset) {
1565  return ploadu_partial_common<Packet8us>(from, n, offset);
1566 }
1567 template <>
1568 EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
1569  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
1570 }
1571 template <>
1572 EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
1573  return ploadu_partial_common<Packet16c>(from, n, offset);
1574 }
1575 template <>
1576 EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
1577  const Index offset) {
1578  return ploadu_partial_common<Packet16uc>(from, n, offset);
1579 }
1580 
1581 template <typename Packet>
1582 EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
1583  Packet p;
1584  if ((std::ptrdiff_t(from) % 16) == 0)
1585  p = pload<Packet>(from);
1586  else
1587  p = ploadu<Packet>(from);
1588  return vec_mergeh(p, p);
1589 }
1590 template <>
1591 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1592  return ploaddup_common<Packet4f>(from);
1593 }
1594 template <>
1595 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1596  return ploaddup_common<Packet4i>(from);
1597 }
1598 
1599 template <>
1600 EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
1601  Packet8s p;
1602  if ((std::ptrdiff_t(from) % 16) == 0)
1603  p = pload<Packet8s>(from);
1604  else
1605  p = ploadu<Packet8s>(from);
1606  return vec_mergeh(p, p);
1607 }
1608 
1609 template <>
1610 EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
1611  Packet8us p;
1612  if ((std::ptrdiff_t(from) % 16) == 0)
1613  p = pload<Packet8us>(from);
1614  else
1615  p = ploadu<Packet8us>(from);
1616  return vec_mergeh(p, p);
1617 }
1618 
1619 template <>
1620 EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
1621  Packet8s p;
1622  if ((std::ptrdiff_t(from) % 16) == 0)
1623  p = pload<Packet8s>(from);
1624  else
1625  p = ploadu<Packet8s>(from);
1626  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1627 }
1628 
1629 template <>
1630 EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
1631  Packet8us p;
1632  if ((std::ptrdiff_t(from) % 16) == 0)
1633  p = pload<Packet8us>(from);
1634  else
1635  p = ploadu<Packet8us>(from);
1636  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1637 }
1638 
1639 template <>
1640 EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
1641  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1642 }
1643 
1644 template <>
1645 EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
1646  Packet16c p;
1647  if ((std::ptrdiff_t(from) % 16) == 0)
1648  p = pload<Packet16c>(from);
1649  else
1650  p = ploadu<Packet16c>(from);
1651  return vec_mergeh(p, p);
1652 }
1653 
1654 template <>
1655 EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
1656  Packet16uc p;
1657  if ((std::ptrdiff_t(from) % 16) == 0)
1658  p = pload<Packet16uc>(from);
1659  else
1660  p = ploadu<Packet16uc>(from);
1661  return vec_mergeh(p, p);
1662 }
1663 
1664 template <>
1665 EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
1666  Packet16c p;
1667  if ((std::ptrdiff_t(from) % 16) == 0)
1668  p = pload<Packet16c>(from);
1669  else
1670  p = ploadu<Packet16c>(from);
1671  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1672 }
1673 
1674 template <>
1675 EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
1676  Packet16uc p;
1677  if ((std::ptrdiff_t(from) % 16) == 0)
1678  p = pload<Packet16uc>(from);
1679  else
1680  p = ploadu<Packet16uc>(from);
1681  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1682 }
1683 
1684 template <typename Packet>
1685 EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
1686  EIGEN_DEBUG_UNALIGNED_STORE
1687 #if defined(EIGEN_VECTORIZE_VSX)
1688  vec_xst(from, 0, to);
1689 #else
1690  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1691  // Warning: not thread safe!
1692  Packet16uc MSQ, LSQ, edges;
1693  Packet16uc edgeAlign, align;
1694 
1695  MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
1696  LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
1697  edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1698  edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
1699  align = vec_lvsr(0, to); // permute map to misalign data
1700  MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
1701  LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
1702  vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
1703  vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
1704 #endif
1705 }
1706 template <>
1707 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1708  pstoreu_common<Packet4f>(to, from);
1709 }
1710 template <>
1711 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1712  pstoreu_common<Packet4i>(to, from);
1713 }
1714 template <>
1715 EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
1716  pstoreu_common<Packet8s>(to, from);
1717 }
1718 template <>
1719 EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
1720  pstoreu_common<Packet8us>(to, from);
1721 }
1722 template <>
1723 EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
1724  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
1725 }
1726 template <>
1727 EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
1728  pstoreu_common<Packet16c>(to, from);
1729 }
1730 template <>
1731 EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
1732  pstoreu_common<Packet16uc>(to, from);
1733 }
1734 
1735 template <typename Packet>
1736 EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
1737  const Index offset) {
1738  const Index packet_size = unpacket_traits<Packet>::size;
1739  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
1740  const Index size = sizeof(__UNPACK_TYPE__(Packet));
1741 #ifdef _ARCH_PWR9
1742  EIGEN_UNUSED_VARIABLE(packet_size);
1743  EIGEN_DEBUG_UNALIGNED_STORE
1744  Packet store = from;
1745  if (offset) {
1746  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1747 #ifdef _BIG_ENDIAN
1748  store = Packet(vec_slo(Packet16uc(store), shift));
1749 #else
1750  store = Packet(vec_sro(Packet16uc(store), shift));
1751 #endif
1752  }
1753  vec_xst_len(store, to, n * size);
1754 #else
1755  if (n) {
1756  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1757  pstore(store, from);
1758  unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
1759  unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
1760  Index n2 = n * size;
1761  if (16 <= n2) {
1762  pstoreu(to2, ploadu<Packet16uc>(store2));
1763  } else {
1764  memcpy((void*)to2, (void*)store2, n2);
1765  }
1766  }
1767 #endif
1768 }
1769 
1770 template <>
1771 EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
1772  pstoreu_partial_common<Packet4f>(to, from, n, offset);
1773 }
1774 template <>
1775 EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
1776  pstoreu_partial_common<Packet4i>(to, from, n, offset);
1777 }
1778 template <>
1779 EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
1780  const Index offset) {
1781  pstoreu_partial_common<Packet8s>(to, from, n, offset);
1782 }
1783 template <>
1784 EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
1785  const Index n, const Index offset) {
1786  pstoreu_partial_common<Packet8us>(to, from, n, offset);
1787 }
1788 template <>
1789 EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
1790  const Index offset) {
1791  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
1792 }
1793 template <>
1794 EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
1795  const Index offset) {
1796  pstoreu_partial_common<Packet16c>(to, from, n, offset);
1797 }
1798 template <>
1799 EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
1800  const Index offset) {
1801  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
1802 }
1803 
1804 template <>
1805 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1806  EIGEN_PPC_PREFETCH(addr);
1807 }
1808 template <>
1809 EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1810  EIGEN_PPC_PREFETCH(addr);
1811 }
1812 
1813 template <>
1814 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1815  EIGEN_ALIGN16 float x;
1816  vec_ste(a, 0, &x);
1817  return x;
1818 }
1819 template <>
1820 EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1821  EIGEN_ALIGN16 int x;
1822  vec_ste(a, 0, &x);
1823  return x;
1824 }
1825 
1826 template <typename Packet>
1827 EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1828  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1829  vec_ste(a, 0, &x);
1830  return x;
1831 }
1832 
1833 template <>
1834 EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1835  return pfirst_common<Packet8s>(a);
1836 }
1837 
1838 template <>
1839 EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1840  return pfirst_common<Packet8us>(a);
1841 }
1842 
1843 template <>
1844 EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
1845  return pfirst_common<Packet16c>(a);
1846 }
1847 
1848 template <>
1849 EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
1850  return pfirst_common<Packet16uc>(a);
1851 }
1852 
1853 template <>
1854 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1855  return reinterpret_cast<Packet4f>(
1856  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1857 }
1858 template <>
1859 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1860  return reinterpret_cast<Packet4i>(
1861  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1862 }
1863 template <>
1864 EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1865  return reinterpret_cast<Packet8s>(
1866  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1867 }
1868 template <>
1869 EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1870  return reinterpret_cast<Packet8us>(
1871  vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1872 }
1873 template <>
1874 EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1875  return vec_perm(a, a, p16uc_REVERSE8);
1876 }
1877 template <>
1878 EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1879  return vec_perm(a, a, p16uc_REVERSE8);
1880 }
1881 template <>
1882 EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
1883  return preverse<Packet8us>(a);
1884 }
1885 
1886 template <>
1887 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1888  return vec_abs(a);
1889 }
1890 template <>
1891 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1892  return vec_abs(a);
1893 }
1894 template <>
1895 EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1896  return vec_abs(a);
1897 }
1898 template <>
1899 EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1900  return a;
1901 }
1902 template <>
1903 EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1904  return vec_abs(a);
1905 }
1906 template <>
1907 EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1908  return a;
1909 }
1910 template <>
1911 EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1912  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
1913  return pand<Packet8us>(p8us_abs_mask, a);
1914 }
1915 
1916 template <>
1917 EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
1918  return vec_sra(a.m_val, vec_splat_u16(15));
1919 }
1920 template <>
1921 EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1922  return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
1923 }
1924 
1925 template <int N>
1926 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1927  return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1928 }
1929 template <int N>
1930 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1931  return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1932 }
1933 template <int N>
1934 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1935  return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1936 }
1937 template <int N>
1938 EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
1939  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1940  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1941  return reinterpret_cast<Packet4f>(r);
1942 }
1943 
1944 template <int N>
1945 EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
1946  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1947  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1948  return reinterpret_cast<Packet4f>(r);
1949 }
1950 
1951 template <int N>
1952 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1953  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1954  return vec_sr(a, p4ui_mask);
1955 }
1956 
1957 template <int N>
1958 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1959  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1960  return vec_sl(a, p4ui_mask);
1961 }
1962 
1963 template <int N>
1964 EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1965  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1966  return vec_sl(a, p8us_mask);
1967 }
1968 template <int N>
1969 EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1970  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1971  return vec_sr(a, p8us_mask);
1972 }
1973 
1974 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
1975  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1976 }
1977 
1978 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
1979  const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1980  return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
1981 }
1982 
1983 EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
1984 #ifdef _BIG_ENDIAN
1985  return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
1986 #else
1987  return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
1988 #endif
1989 }
1990 
1991 // Simple interleaving of bool masks, prevents true values from being
1992 // converted to NaNs.
1993 EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1994  return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
1995 }
1996 
1997 // #define SUPPORT_BF16_SUBNORMALS
1998 
1999 #ifndef __VEC_CLASS_FP_NAN
2000 #define __VEC_CLASS_FP_NAN (1 << 6)
2001 #endif
2002 
2003 #if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
2004 #define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2005 #define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2006 
2007 #define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
2008 #endif
2009 
2010 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
2011 #ifdef _ARCH_PWR10
2012  return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
2013 #else
2014  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
2015  Packet4ui lsb = plogical_shift_right<16>(input);
2016  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
2017 
2018  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
2019  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
2020  input = padd<Packet4ui>(input, rounding_bias);
2021 
2022  const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
2023 #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2024  Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
2025  input = vec_sel(input, p4ui_nan, nan_selector);
2026 
2027 #ifdef SUPPORT_BF16_SUBNORMALS
2028  Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
2029  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2030 #endif
2031 #else
2032 #ifdef SUPPORT_BF16_SUBNORMALS
2033  // Test NaN and Subnormal
2034  const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
2035  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
2036 
2037  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
2038  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
2039 
2040  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
2041  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
2042 
2043  Packet4ui nan_selector =
2044  pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2045 
2046  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
2047 
2048  Packet4ui subnormal_selector =
2049  pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2050 
2051  input = vec_sel(input, p4ui_nan, nan_selector);
2052  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2053 #else
2054  // Test only NaN
2055  Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
2056 
2057  input = vec_sel(p4ui_nan, input, nan_selector);
2058 #endif
2059 #endif
2060 
2061  input = plogical_shift_right<16>(input);
2062  return reinterpret_cast<Packet8us>(input);
2063 #endif
2064 }
2065 
2066 #ifdef _BIG_ENDIAN
2067 
2072 template <bool lohi>
2073 EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
2074  if (lohi) {
2075  return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
2076  } else {
2077  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2078  }
2079 }
2080 
2086 template <bool lohi>
2087 EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
2088  if (lohi) {
2089  return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
2090  } else {
2091  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2092  }
2093 }
2094 #else
2095 template <bool lohi>
2096 EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
2097  if (lohi) {
2098  return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
2099  } else {
2100  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2101  }
2102 }
2103 
2104 template <bool lohi>
2105 EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
2106  if (lohi) {
2107  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
2108  } else {
2109  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2110  }
2111 }
2112 #endif
2113 
2119 template <bool lohi = true>
2120 EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
2121  Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
2122  Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
2123 
2124  Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
2125  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
2126  lsb = padd<Packet8us>(lsb, p8us_BIAS);
2127  lsb = padd<Packet8us>(lsb, p4f2);
2128 
2129  Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
2130  Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
2131 
2132 #if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2133  Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
2134  Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
2135  Packet8us nan_selector =
2136  Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2137 
2138  input = vec_sel(input, p8us_BIAS, nan_selector);
2139 
2140 #ifdef SUPPORT_BF16_SUBNORMALS
2141  Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
2142  Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
2143  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
2144  reinterpret_cast<Packet4f>(subnormal_selector_hi));
2145 
2146  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2147 #endif
2148 #else
2149 #ifdef SUPPORT_BF16_SUBNORMALS
2150  // Test NaN and Subnormal
2151  const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
2152  Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
2153 
2154  const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
2155  Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
2156 
2157  Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
2158  Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
2159 
2160  Packet8us nan_selector =
2161  pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2162 
2163  Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
2164 
2165  Packet8us subnormal_selector =
2166  pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2167 
2168  // Using BIAS as NaN (since any or all of the last 7 bits can be set)
2169  input = vec_sel(input, p8us_BIAS, nan_selector);
2170  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2171 #else
2172  // Test only NaN
2173  Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
2174  Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
2175  Packet8us nan_selector =
2176  Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2177 
2178  input = vec_sel(p8us_BIAS, input, nan_selector);
2179 #endif
2180 #endif
2181 
2182  return input;
2183 }
2184 
2188 EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
2189 #ifdef _ARCH_PWR10
2190  Packet8bf fp16_0 = F32ToBf16(lo);
2191  Packet8bf fp16_1 = F32ToBf16(hi);
2192  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
2193 #else
2194  return F32ToBf16Two(lo, hi);
2195 #endif
2196 }
2197 
2201 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
2202 #ifdef _ARCH_PWR10
2203  return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
2204 #else
2205  return F32ToBf16Two<false>(even, odd);
2206 #endif
2207 }
2208 #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
2209  Packet4f a_even = Bf16ToF32Even(A); \
2210  Packet4f a_odd = Bf16ToF32Odd(A); \
2211  Packet4f op_even = OP(a_even); \
2212  Packet4f op_odd = OP(a_odd); \
2213  return F32ToBf16(op_even, op_odd);
2214 
2215 #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
2216  Packet4f a_even = Bf16ToF32Even(A); \
2217  Packet4f a_odd = Bf16ToF32Odd(A); \
2218  Packet4f b_even = Bf16ToF32Even(B); \
2219  Packet4f b_odd = Bf16ToF32Odd(B); \
2220  Packet4f op_even = OP(a_even, b_even); \
2221  Packet4f op_odd = OP(a_odd, b_odd); \
2222  return F32ToBf16(op_even, op_odd);
2223 
2224 #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
2225  Packet4f a_even = Bf16ToF32Even(A); \
2226  Packet4f a_odd = Bf16ToF32Odd(A); \
2227  Packet4f b_even = Bf16ToF32Even(B); \
2228  Packet4f b_odd = Bf16ToF32Odd(B); \
2229  Packet4f op_even = OP(a_even, b_even); \
2230  Packet4f op_odd = OP(a_odd, b_odd); \
2231  return F32ToBf16Bool(op_even, op_odd);
2232 
2233 template <>
2234 EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2235  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
2236 }
2237 
2238 template <>
2239 EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2240  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
2241 }
2242 
2243 template <>
2244 EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2245  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
2246 }
2247 
2248 template <>
2249 EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
2250  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
2251  return pxor<Packet8us>(p8us_neg_mask, a);
2252 }
2253 
2254 template <>
2255 EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2256  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
2257 }
2258 
2259 template <>
2260 EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
2261  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
2262 }
2263 
2264 template <>
2265 EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
2266  BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
2267 }
2268 
2269 template <>
2270 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2271  return pldexp_generic(a, exponent);
2272 }
2273 template <>
2274 EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
2275  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
2276 }
2277 
2278 template <>
2279 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2280  return pfrexp_generic(a, exponent);
2281 }
2282 template <>
2283 EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
2284  Packet4f a_even = Bf16ToF32Even(a);
2285  Packet4f a_odd = Bf16ToF32Odd(a);
2286  Packet4f e_even;
2287  Packet4f e_odd;
2288  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
2289  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
2290  e = F32ToBf16(e_even, e_odd);
2291  return F32ToBf16(op_even, op_odd);
2292 }
2293 
2294 template <>
2295 EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
2296  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
2297 }
2298 template <>
2299 EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
2300  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
2301 }
2302 template <>
2303 EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
2304  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
2305 }
2306 template <>
2307 EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
2308  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
2309 }
2310 template <>
2311 EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
2312  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
2313 }
2314 template <>
2315 EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
2316  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
2317 }
2318 template <>
2319 EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
2320  BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
2321 }
2322 #ifdef EIGEN_VECTORIZE_VSX
2323 template <>
2324 EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
2325  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
2326 }
2327 #endif
2328 template <>
2329 EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2330  Packet4f a_even = Bf16ToF32Even(a);
2331  Packet4f a_odd = Bf16ToF32Odd(a);
2332  Packet4f b_even = Bf16ToF32Even(b);
2333  Packet4f b_odd = Bf16ToF32Odd(b);
2334  Packet4f c_even = Bf16ToF32Even(c);
2335  Packet4f c_odd = Bf16ToF32Odd(c);
2336  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
2337  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
2338  return F32ToBf16(pmadd_even, pmadd_odd);
2339 }
2340 
2341 template <>
2342 EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2343  Packet4f a_even = Bf16ToF32Even(a);
2344  Packet4f a_odd = Bf16ToF32Odd(a);
2345  Packet4f b_even = Bf16ToF32Even(b);
2346  Packet4f b_odd = Bf16ToF32Odd(b);
2347  Packet4f c_even = Bf16ToF32Even(c);
2348  Packet4f c_odd = Bf16ToF32Odd(c);
2349  Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2350  Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2351  return F32ToBf16(pmadd_even, pmadd_odd);
2352 }
2353 template <>
2354 EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2355  Packet4f a_even = Bf16ToF32Even(a);
2356  Packet4f a_odd = Bf16ToF32Odd(a);
2357  Packet4f b_even = Bf16ToF32Even(b);
2358  Packet4f b_odd = Bf16ToF32Odd(b);
2359  Packet4f c_even = Bf16ToF32Even(c);
2360  Packet4f c_odd = Bf16ToF32Odd(c);
2361  Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2362  Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2363  return F32ToBf16(pmadd_even, pmadd_odd);
2364 }
2365 
2366 template <>
2367 EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2368  Packet4f a_even = Bf16ToF32Even(a);
2369  Packet4f a_odd = Bf16ToF32Odd(a);
2370  Packet4f b_even = Bf16ToF32Even(b);
2371  Packet4f b_odd = Bf16ToF32Odd(b);
2372  Packet4f c_even = Bf16ToF32Even(c);
2373  Packet4f c_odd = Bf16ToF32Odd(c);
2374  Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2375  Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2376  return F32ToBf16(pmadd_even, pmadd_odd);
2377 }
2378 
2379 template <>
2380 EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2381  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
2382 }
2383 
2384 template <>
2385 EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2386  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
2387 }
2388 
2389 template <>
2390 EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2391  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
2392 }
2393 template <>
2394 EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2395  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
2396 }
2397 template <>
2398 EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2399  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
2400 }
2401 template <>
2402 EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2403  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
2404 }
2405 
2406 template <>
2407 EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
2408  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
2409 }
2410 
2411 template <>
2412 EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
2413  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
2414 }
2415 
2416 template <>
2417 EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2418  bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2419  bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
2420  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
2421 }
2422 
2423 template <>
2424 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
2425  Packet4f b, sum;
2426  b = vec_sld(a, a, 8);
2427  sum = a + b;
2428  b = vec_sld(sum, sum, 4);
2429  sum += b;
2430  return pfirst(sum);
2431 }
2432 
2433 template <>
2434 EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
2435  Packet4i b, sum;
2436  b = vec_sld(a, a, 8);
2437  sum = a + b;
2438  b = vec_sld(sum, sum, 4);
2439  sum += b;
2440  return pfirst(sum);
2441 }
2442 
2443 template <>
2444 EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
2445  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2446  float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2447  float f32_result = redux_even + redux_odd;
2448  return bfloat16(f32_result);
2449 }
2450 template <typename Packet>
2451 EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
2452  union {
2453  Packet v;
2454  __UNPACK_TYPE__(Packet) n[8];
2455  } vt;
2456  vt.v = a;
2457 
2458  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2459  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2460  Packet4i first_half = pload<Packet4i>(first_loader);
2461  Packet4i second_half = pload<Packet4i>(second_loader);
2462 
2463  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
2464 }
2465 
2466 template <>
2467 EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
2468  return predux_size8<Packet8s>(a);
2469 }
2470 
2471 template <>
2472 EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
2473  return predux_size8<Packet8us>(a);
2474 }
2475 
2476 template <typename Packet>
2477 EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
2478  union {
2479  Packet v;
2480  __UNPACK_TYPE__(Packet) n[16];
2481  } vt;
2482  vt.v = a;
2483 
2484  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2485  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2486  EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
2487  EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
2488 
2489  Packet4i first_quarter = pload<Packet4i>(first_loader);
2490  Packet4i second_quarter = pload<Packet4i>(second_loader);
2491  Packet4i third_quarter = pload<Packet4i>(third_loader);
2492  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
2493 
2494  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
2495  predux(fourth_quarter));
2496 }
2497 
2498 template <>
2499 EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
2500  return predux_size16<Packet16c>(a);
2501 }
2502 
2503 template <>
2504 EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
2505  return predux_size16<Packet16uc>(a);
2506 }
2507 
2508 // Other reduction functions:
2509 // mul
2510 template <>
2511 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
2512  Packet4f prod;
2513  prod = pmul(a, vec_sld(a, a, 8));
2514  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
2515 }
2516 
2517 template <>
2518 EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
2519  EIGEN_ALIGN16 int aux[4];
2520  pstore(aux, a);
2521  return aux[0] * aux[1] * aux[2] * aux[3];
2522 }
2523 
2524 template <>
2525 EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
2526  Packet8s pair, quad, octo;
2527 
2528  pair = vec_mul(a, vec_sld(a, a, 8));
2529  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2530  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2531 
2532  return pfirst(octo);
2533 }
2534 
2535 template <>
2536 EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
2537  Packet8us pair, quad, octo;
2538 
2539  pair = vec_mul(a, vec_sld(a, a, 8));
2540  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2541  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2542 
2543  return pfirst(octo);
2544 }
2545 
2546 template <>
2547 EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
2548  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2549  float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2550  float f32_result = redux_even * redux_odd;
2551  return bfloat16(f32_result);
2552 }
2553 
2554 template <>
2555 EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
2556  Packet16c pair, quad, octo, result;
2557 
2558  pair = vec_mul(a, vec_sld(a, a, 8));
2559  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2560  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2561  result = vec_mul(octo, vec_sld(octo, octo, 1));
2562 
2563  return pfirst(result);
2564 }
2565 
2566 template <>
2567 EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
2568  Packet16uc pair, quad, octo, result;
2569 
2570  pair = vec_mul(a, vec_sld(a, a, 8));
2571  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2572  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2573  result = vec_mul(octo, vec_sld(octo, octo, 1));
2574 
2575  return pfirst(result);
2576 }
2577 
2578 // min
2579 template <typename Packet>
2580 EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
2581  Packet b, res;
2582  b = vec_min(a, vec_sld(a, a, 8));
2583  res = vec_min(b, vec_sld(b, b, 4));
2584  return pfirst(res);
2585 }
2586 
2587 template <>
2588 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2589  return predux_min4<Packet4f>(a);
2590 }
2591 
2592 template <>
2593 EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
2594  return predux_min4<Packet4i>(a);
2595 }
2596 
2597 template <>
2598 EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
2599  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2600  float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2601  float f32_result = (std::min)(redux_even, redux_odd);
2602  return bfloat16(f32_result);
2603 }
2604 
2605 template <>
2606 EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
2607  Packet8s pair, quad, octo;
2608 
2609  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2610  pair = vec_min(a, vec_sld(a, a, 8));
2611 
2612  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2613  quad = vec_min(pair, vec_sld(pair, pair, 4));
2614 
2615  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2616  octo = vec_min(quad, vec_sld(quad, quad, 2));
2617  return pfirst(octo);
2618 }
2619 
2620 template <>
2621 EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
2622  Packet8us pair, quad, octo;
2623 
2624  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2625  pair = vec_min(a, vec_sld(a, a, 8));
2626 
2627  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2628  quad = vec_min(pair, vec_sld(pair, pair, 4));
2629 
2630  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2631  octo = vec_min(quad, vec_sld(quad, quad, 2));
2632  return pfirst(octo);
2633 }
2634 
2635 template <>
2636 EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
2637  Packet16c pair, quad, octo, result;
2638 
2639  pair = vec_min(a, vec_sld(a, a, 8));
2640  quad = vec_min(pair, vec_sld(pair, pair, 4));
2641  octo = vec_min(quad, vec_sld(quad, quad, 2));
2642  result = vec_min(octo, vec_sld(octo, octo, 1));
2643 
2644  return pfirst(result);
2645 }
2646 
2647 template <>
2648 EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
2649  Packet16uc pair, quad, octo, result;
2650 
2651  pair = vec_min(a, vec_sld(a, a, 8));
2652  quad = vec_min(pair, vec_sld(pair, pair, 4));
2653  octo = vec_min(quad, vec_sld(quad, quad, 2));
2654  result = vec_min(octo, vec_sld(octo, octo, 1));
2655 
2656  return pfirst(result);
2657 }
2658 // max
2659 template <typename Packet>
2660 EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
2661  Packet b, res;
2662  b = vec_max(a, vec_sld(a, a, 8));
2663  res = vec_max(b, vec_sld(b, b, 4));
2664  return pfirst(res);
2665 }
2666 
2667 template <>
2668 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2669  return predux_max4<Packet4f>(a);
2670 }
2671 
2672 template <>
2673 EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
2674  return predux_max4<Packet4i>(a);
2675 }
2676 
2677 template <>
2678 EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
2679  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2680  float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2681  float f32_result = (std::max)(redux_even, redux_odd);
2682  return bfloat16(f32_result);
2683 }
2684 
2685 template <>
2686 EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
2687  Packet8s pair, quad, octo;
2688 
2689  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2690  pair = vec_max(a, vec_sld(a, a, 8));
2691 
2692  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2693  quad = vec_max(pair, vec_sld(pair, pair, 4));
2694 
2695  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2696  octo = vec_max(quad, vec_sld(quad, quad, 2));
2697  return pfirst(octo);
2698 }
2699 
2700 template <>
2701 EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
2702  Packet8us pair, quad, octo;
2703 
2704  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2705  pair = vec_max(a, vec_sld(a, a, 8));
2706 
2707  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2708  quad = vec_max(pair, vec_sld(pair, pair, 4));
2709 
2710  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2711  octo = vec_max(quad, vec_sld(quad, quad, 2));
2712  return pfirst(octo);
2713 }
2714 
2715 template <>
2716 EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
2717  Packet16c pair, quad, octo, result;
2718 
2719  pair = vec_max(a, vec_sld(a, a, 8));
2720  quad = vec_max(pair, vec_sld(pair, pair, 4));
2721  octo = vec_max(quad, vec_sld(quad, quad, 2));
2722  result = vec_max(octo, vec_sld(octo, octo, 1));
2723 
2724  return pfirst(result);
2725 }
2726 
2727 template <>
2728 EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
2729  Packet16uc pair, quad, octo, result;
2730 
2731  pair = vec_max(a, vec_sld(a, a, 8));
2732  quad = vec_max(pair, vec_sld(pair, pair, 4));
2733  octo = vec_max(quad, vec_sld(quad, quad, 2));
2734  result = vec_max(octo, vec_sld(octo, octo, 1));
2735 
2736  return pfirst(result);
2737 }
2738 
2739 template <>
2740 EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
2741  return vec_any_ne(x, pzero(x));
2742 }
2743 
2744 template <typename T>
2745 EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
2746  T t0, t1, t2, t3;
2747  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2748  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2749  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2750  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2751  kernel.packet[0] = vec_mergeh(t0, t2);
2752  kernel.packet[1] = vec_mergel(t0, t2);
2753  kernel.packet[2] = vec_mergeh(t1, t3);
2754  kernel.packet[3] = vec_mergel(t1, t3);
2755 }
2756 
2757 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
2758 
2759 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
2760 
2761 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2762  Packet8s t0, t1, t2, t3;
2763  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2764  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2765  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2766  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2767  kernel.packet[0] = vec_mergeh(t0, t2);
2768  kernel.packet[1] = vec_mergel(t0, t2);
2769  kernel.packet[2] = vec_mergeh(t1, t3);
2770  kernel.packet[3] = vec_mergel(t1, t3);
2771 }
2772 
2773 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2774  Packet8us t0, t1, t2, t3;
2775  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2776  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2777  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2778  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2779  kernel.packet[0] = vec_mergeh(t0, t2);
2780  kernel.packet[1] = vec_mergel(t0, t2);
2781  kernel.packet[2] = vec_mergeh(t1, t3);
2782  kernel.packet[3] = vec_mergel(t1, t3);
2783 }
2784 
2785 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2786  Packet8us t0, t1, t2, t3;
2787 
2788  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
2789  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
2790  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
2791  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
2792  kernel.packet[0] = vec_mergeh(t0, t2);
2793  kernel.packet[1] = vec_mergel(t0, t2);
2794  kernel.packet[2] = vec_mergeh(t1, t3);
2795  kernel.packet[3] = vec_mergel(t1, t3);
2796 }
2797 
2798 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2799  Packet16c t0, t1, t2, t3;
2800  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2801  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2802  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2803  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2804  kernel.packet[0] = vec_mergeh(t0, t2);
2805  kernel.packet[1] = vec_mergel(t0, t2);
2806  kernel.packet[2] = vec_mergeh(t1, t3);
2807  kernel.packet[3] = vec_mergel(t1, t3);
2808 }
2809 
2810 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2811  Packet16uc t0, t1, t2, t3;
2812  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2813  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2814  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2815  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2816  kernel.packet[0] = vec_mergeh(t0, t2);
2817  kernel.packet[1] = vec_mergel(t0, t2);
2818  kernel.packet[2] = vec_mergeh(t1, t3);
2819  kernel.packet[3] = vec_mergel(t1, t3);
2820 }
2821 
2822 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2823  Packet8s v[8], sum[8];
2824 
2825  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2826  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2827  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2828  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2829  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2830  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2831  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2832  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2833  sum[0] = vec_mergeh(v[0], v[4]);
2834  sum[1] = vec_mergel(v[0], v[4]);
2835  sum[2] = vec_mergeh(v[1], v[5]);
2836  sum[3] = vec_mergel(v[1], v[5]);
2837  sum[4] = vec_mergeh(v[2], v[6]);
2838  sum[5] = vec_mergel(v[2], v[6]);
2839  sum[6] = vec_mergeh(v[3], v[7]);
2840  sum[7] = vec_mergel(v[3], v[7]);
2841 
2842  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2843  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2844  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2845  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2846  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2847  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2848  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2849  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2850 }
2851 
2852 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2853  Packet8us v[8], sum[8];
2854 
2855  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2856  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2857  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2858  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2859  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2860  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2861  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2862  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2863  sum[0] = vec_mergeh(v[0], v[4]);
2864  sum[1] = vec_mergel(v[0], v[4]);
2865  sum[2] = vec_mergeh(v[1], v[5]);
2866  sum[3] = vec_mergel(v[1], v[5]);
2867  sum[4] = vec_mergeh(v[2], v[6]);
2868  sum[5] = vec_mergel(v[2], v[6]);
2869  sum[6] = vec_mergeh(v[3], v[7]);
2870  sum[7] = vec_mergel(v[3], v[7]);
2871 
2872  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2873  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2874  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2875  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2876  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2877  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2878  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2879  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2880 }
2881 
2882 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2883  Packet8bf v[8], sum[8];
2884 
2885  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2886  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2887  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2888  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2889  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2890  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2891  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2892  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
2893  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2894  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2895  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2896  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2897  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2898  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2899  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2900  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2901 
2902  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2903  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2904  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2905  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2906  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2907  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2908  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2909  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
2910 }
2911 
2912 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2913  Packet16c step1[16], step2[16], step3[16];
2914 
2915  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2916  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2917  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2918  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2919  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2920  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2921  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2922  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2923  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2924  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2925  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2926  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2927  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2928  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2929  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2930  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2931 
2932  step2[0] = vec_mergeh(step1[0], step1[8]);
2933  step2[1] = vec_mergel(step1[0], step1[8]);
2934  step2[2] = vec_mergeh(step1[1], step1[9]);
2935  step2[3] = vec_mergel(step1[1], step1[9]);
2936  step2[4] = vec_mergeh(step1[2], step1[10]);
2937  step2[5] = vec_mergel(step1[2], step1[10]);
2938  step2[6] = vec_mergeh(step1[3], step1[11]);
2939  step2[7] = vec_mergel(step1[3], step1[11]);
2940  step2[8] = vec_mergeh(step1[4], step1[12]);
2941  step2[9] = vec_mergel(step1[4], step1[12]);
2942  step2[10] = vec_mergeh(step1[5], step1[13]);
2943  step2[11] = vec_mergel(step1[5], step1[13]);
2944  step2[12] = vec_mergeh(step1[6], step1[14]);
2945  step2[13] = vec_mergel(step1[6], step1[14]);
2946  step2[14] = vec_mergeh(step1[7], step1[15]);
2947  step2[15] = vec_mergel(step1[7], step1[15]);
2948 
2949  step3[0] = vec_mergeh(step2[0], step2[8]);
2950  step3[1] = vec_mergel(step2[0], step2[8]);
2951  step3[2] = vec_mergeh(step2[1], step2[9]);
2952  step3[3] = vec_mergel(step2[1], step2[9]);
2953  step3[4] = vec_mergeh(step2[2], step2[10]);
2954  step3[5] = vec_mergel(step2[2], step2[10]);
2955  step3[6] = vec_mergeh(step2[3], step2[11]);
2956  step3[7] = vec_mergel(step2[3], step2[11]);
2957  step3[8] = vec_mergeh(step2[4], step2[12]);
2958  step3[9] = vec_mergel(step2[4], step2[12]);
2959  step3[10] = vec_mergeh(step2[5], step2[13]);
2960  step3[11] = vec_mergel(step2[5], step2[13]);
2961  step3[12] = vec_mergeh(step2[6], step2[14]);
2962  step3[13] = vec_mergel(step2[6], step2[14]);
2963  step3[14] = vec_mergeh(step2[7], step2[15]);
2964  step3[15] = vec_mergel(step2[7], step2[15]);
2965 
2966  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2967  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2968  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2969  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2970  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2971  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2972  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2973  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2974  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2975  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2976  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2977  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2978  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2979  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2980  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2981  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2982 }
2983 
2984 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2985  Packet16uc step1[16], step2[16], step3[16];
2986 
2987  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2988  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2989  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2990  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2991  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2992  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2993  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2994  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2995  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2996  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2997  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2998  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2999  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
3000  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
3001  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
3002  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
3003 
3004  step2[0] = vec_mergeh(step1[0], step1[8]);
3005  step2[1] = vec_mergel(step1[0], step1[8]);
3006  step2[2] = vec_mergeh(step1[1], step1[9]);
3007  step2[3] = vec_mergel(step1[1], step1[9]);
3008  step2[4] = vec_mergeh(step1[2], step1[10]);
3009  step2[5] = vec_mergel(step1[2], step1[10]);
3010  step2[6] = vec_mergeh(step1[3], step1[11]);
3011  step2[7] = vec_mergel(step1[3], step1[11]);
3012  step2[8] = vec_mergeh(step1[4], step1[12]);
3013  step2[9] = vec_mergel(step1[4], step1[12]);
3014  step2[10] = vec_mergeh(step1[5], step1[13]);
3015  step2[11] = vec_mergel(step1[5], step1[13]);
3016  step2[12] = vec_mergeh(step1[6], step1[14]);
3017  step2[13] = vec_mergel(step1[6], step1[14]);
3018  step2[14] = vec_mergeh(step1[7], step1[15]);
3019  step2[15] = vec_mergel(step1[7], step1[15]);
3020 
3021  step3[0] = vec_mergeh(step2[0], step2[8]);
3022  step3[1] = vec_mergel(step2[0], step2[8]);
3023  step3[2] = vec_mergeh(step2[1], step2[9]);
3024  step3[3] = vec_mergel(step2[1], step2[9]);
3025  step3[4] = vec_mergeh(step2[2], step2[10]);
3026  step3[5] = vec_mergel(step2[2], step2[10]);
3027  step3[6] = vec_mergeh(step2[3], step2[11]);
3028  step3[7] = vec_mergel(step2[3], step2[11]);
3029  step3[8] = vec_mergeh(step2[4], step2[12]);
3030  step3[9] = vec_mergel(step2[4], step2[12]);
3031  step3[10] = vec_mergeh(step2[5], step2[13]);
3032  step3[11] = vec_mergel(step2[5], step2[13]);
3033  step3[12] = vec_mergeh(step2[6], step2[14]);
3034  step3[13] = vec_mergel(step2[6], step2[14]);
3035  step3[14] = vec_mergeh(step2[7], step2[15]);
3036  step3[15] = vec_mergel(step2[7], step2[15]);
3037 
3038  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3039  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3040  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3041  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3042  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3043  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3044  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3045  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3046  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3047  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3048  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
3049  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
3050  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
3051  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
3052  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
3053  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
3054 }
3055 
3056 template <typename Packet>
3057 EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
3058  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
3059  Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
3060  return vec_sel(elsePacket, thenPacket, mask);
3061 }
3062 
3063 template <>
3064 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
3065  const Packet4i& elsePacket) {
3066  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
3067 }
3068 
3069 template <>
3070 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
3071  const Packet4f& elsePacket) {
3072  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
3073 }
3074 
3075 template <>
3076 EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
3077  const Packet8s& elsePacket) {
3078  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3079  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3080  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
3081  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
3082  return result;
3083 }
3084 
3085 template <>
3086 EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
3087  const Packet8us& elsePacket) {
3088  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3089  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3090  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
3091  return vec_sel(elsePacket, thenPacket, mask);
3092 }
3093 
3094 template <>
3095 EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
3096  const Packet8bf& elsePacket) {
3097  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
3098 }
3099 
3100 template <>
3101 EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
3102  const Packet16c& elsePacket) {
3103  Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3104  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3105  ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3106  ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3107 
3108  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3109  return vec_sel(elsePacket, thenPacket, mask);
3110 }
3111 
3112 template <>
3113 EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
3114  const Packet16uc& elsePacket) {
3115  Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3116  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3117  ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3118  ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3119 
3120  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3121  return vec_sel(elsePacket, thenPacket, mask);
3122 }
3123 
3124 //---------- double ----------
3125 #ifdef EIGEN_VECTORIZE_VSX
3126 typedef __vector double Packet2d;
3127 typedef __vector unsigned long long Packet2ul;
3128 typedef __vector long long Packet2l;
3129 #if EIGEN_COMP_CLANG
3130 typedef Packet2ul Packet2bl;
3131 #else
3132 typedef __vector __bool long Packet2bl;
3133 #endif
3134 
3135 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
3136 static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
3137 static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
3138 static Packet2d p2d_ONE = {1.0, 1.0};
3139 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
3140 static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
3141  numext::bit_cast<double>(0x8000000000000000ull)};
3142 
3143 #ifdef _BIG_ENDIAN
3144 static Packet2d p2d_COUNTDOWN =
3145  reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
3146 #else
3147 static Packet2d p2d_COUNTDOWN =
3148  reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
3149 #endif
3150 
3151 template <int index>
3152 Packet2d vec_splat_dbl(Packet2d& a) {
3153  return vec_splat(a, index);
3154 }
3155 
3156 template <>
3157 struct packet_traits<double> : default_packet_traits {
3158  typedef Packet2d type;
3159  typedef Packet2d half;
3160  enum {
3161  Vectorizable = 1,
3162  AlignedOnScalar = 1,
3163  size = 2,
3164 
3165  HasAdd = 1,
3166  HasSub = 1,
3167  HasMul = 1,
3168  HasDiv = 1,
3169  HasMin = 1,
3170  HasMax = 1,
3171  HasAbs = 1,
3172  HasSin = EIGEN_FAST_MATH,
3173  HasCos = EIGEN_FAST_MATH,
3174  HasTanh = EIGEN_FAST_MATH,
3175  HasErf = EIGEN_FAST_MATH,
3176  HasErfc = EIGEN_FAST_MATH,
3177  HasATanh = 1,
3178  HasATan = 0,
3179  HasLog = 0,
3180  HasCmp = 1,
3181  HasExp = 1,
3182  HasSqrt = 1,
3183  HasCbrt = 1,
3184 #if !EIGEN_COMP_CLANG
3185  HasRsqrt = 1,
3186 #else
3187  HasRsqrt = 0,
3188 #endif
3189  HasNegate = 1,
3190  HasBlend = 1
3191  };
3192 };
3193 
3194 template <>
3195 struct unpacket_traits<Packet2d> {
3196  typedef double type;
3197  typedef Packet2l integer_packet;
3198  enum {
3199  size = 2,
3200  alignment = Aligned16,
3201  vectorizable = true,
3202  masked_load_available = false,
3203  masked_store_available = false
3204  };
3205  typedef Packet2d half;
3206 };
3207 template <>
3208 struct unpacket_traits<Packet2l> {
3209  typedef int64_t type;
3210  typedef Packet2l half;
3211  enum {
3212  size = 2,
3213  alignment = Aligned16,
3214  vectorizable = false,
3215  masked_load_available = false,
3216  masked_store_available = false
3217  };
3218 };
3219 
3220 inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
3221  union {
3222  Packet2l v;
3223  int64_t n[2];
3224  } vt;
3225  vt.v = v;
3226  s << vt.n[0] << ", " << vt.n[1];
3227  return s;
3228 }
3229 
3230 inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
3231  union {
3232  Packet2d v;
3233  double n[2];
3234  } vt;
3235  vt.v = v;
3236  s << vt.n[0] << ", " << vt.n[1];
3237  return s;
3238 }
3239 
3240 // Need to define them first or we get specialization after instantiation errors
3241 template <>
3242 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
3243  EIGEN_DEBUG_ALIGNED_LOAD
3244  return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
3245 }
3246 
3247 template <>
3248 EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3249  return pload_partial_common<Packet2d>(from, n, offset);
3250 }
3251 
3252 template <>
3253 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
3254  EIGEN_DEBUG_ALIGNED_STORE
3255  vec_xst(from, 0, to);
3256 }
3257 
3258 template <>
3259 EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3260  pstore_partial_common<Packet2d>(to, from, n, offset);
3261 }
3262 
3263 template <>
3264 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
3265  Packet2d v = {from, from};
3266  return v;
3267 }
3268 template <>
3269 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
3270  Packet2l v = {from, from};
3271  return v;
3272 }
3273 
3274 template <>
3275 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
3276  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
3277  return reinterpret_cast<Packet2d>(v);
3278 }
3279 
3280 template <>
3281 EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
3282  Packet2d& a3) {
3283  // This way is faster than vec_splat (at least for doubles in Power 9)
3284  a0 = pset1<Packet2d>(a[0]);
3285  a1 = pset1<Packet2d>(a[1]);
3286  a2 = pset1<Packet2d>(a[2]);
3287  a3 = pset1<Packet2d>(a[3]);
3288 }
3289 
3290 template <>
3291 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
3292  return pgather_common<Packet2d>(from, stride);
3293 }
3294 template <>
3295 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
3296  const Index n) {
3297  return pgather_common<Packet2d>(from, stride, n);
3298 }
3299 template <>
3300 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
3301  pscatter_common<Packet2d>(to, from, stride);
3302 }
3303 template <>
3304 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
3305  Index stride, const Index n) {
3306  pscatter_common<Packet2d>(to, from, stride, n);
3307 }
3308 
3309 template <>
3310 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
3311  return pset1<Packet2d>(a) + p2d_COUNTDOWN;
3312 }
3313 
3314 template <>
3315 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
3316  return a + b;
3317 }
3318 
3319 template <>
3320 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
3321  return a - b;
3322 }
3323 
3324 template <>
3325 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
3326 #ifdef __POWER8_VECTOR__
3327  return vec_neg(a);
3328 #else
3329  return vec_xor(a, p2d_MZERO);
3330 #endif
3331 }
3332 
3333 template <>
3334 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
3335  return a;
3336 }
3337 
3338 template <>
3339 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
3340  return vec_madd(a, b, p2d_MZERO);
3341 }
3342 template <>
3343 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
3344  return vec_div(a, b);
3345 }
3346 
3347 // for some weird raisons, it has to be overloaded for packet of integers
3348 template <>
3349 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3350  return vec_madd(a, b, c);
3351 }
3352 template <>
3353 EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3354  return vec_msub(a, b, c);
3355 }
3356 template <>
3357 EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3358  return vec_nmsub(a, b, c);
3359 }
3360 template <>
3361 EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3362  return vec_nmadd(a, b, c);
3363 }
3364 
3365 template <>
3366 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
3367  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
3368  Packet2d ret;
3369  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
3370  return ret;
3371 }
3372 
3373 template <>
3374 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
3375  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
3376  Packet2d ret;
3377  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
3378  return ret;
3379 }
3380 
3381 template <>
3382 EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
3383  return reinterpret_cast<Packet2d>(vec_cmple(a, b));
3384 }
3385 template <>
3386 EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
3387  return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
3388 }
3389 template <>
3390 EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
3391  return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
3392 }
3393 template <>
3394 #ifdef __POWER8_VECTOR__
3395 EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3396  return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
3397 }
3398 #else
3399 EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3400  Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
3401  Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
3402  return reinterpret_cast<Packet2l>(pand(halves, flipped));
3403 }
3404 #endif
3405 template <>
3406 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
3407  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
3408  return vec_nor(c, c);
3409 }
3410 
3411 template <>
3412 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
3413  return vec_and(a, b);
3414 }
3415 
3416 template <>
3417 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
3418  return vec_or(a, b);
3419 }
3420 
3421 template <>
3422 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
3423  return vec_xor(a, b);
3424 }
3425 
3426 template <>
3427 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
3428  return vec_and(a, vec_nor(b, b));
3429 }
3430 
3431 template <>
3432 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
3433  Packet2d t = vec_add(
3434  reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
3435  Packet2d res;
3436 
3437  __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
3438 
3439  return res;
3440 }
3441 template <>
3442 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
3443  return vec_ceil(a);
3444 }
3445 template <>
3446 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
3447  return vec_floor(a);
3448 }
3449 template <>
3450 EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
3451  return vec_trunc(a);
3452 }
3453 template <>
3454 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
3455  Packet2d res;
3456 
3457  __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
3458 
3459  return res;
3460 }
3461 
3462 template <>
3463 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
3464  EIGEN_DEBUG_UNALIGNED_LOAD
3465  return vec_xl(0, const_cast<double*>(from));
3466 }
3467 
3468 template <>
3469 EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3470  return ploadu_partial_common<Packet2d>(from, n, offset);
3471 }
3472 
3473 template <>
3474 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
3475  Packet2d p;
3476  if ((std::ptrdiff_t(from) % 16) == 0)
3477  p = pload<Packet2d>(from);
3478  else
3479  p = ploadu<Packet2d>(from);
3480  return vec_splat_dbl<0>(p);
3481 }
3482 
3483 template <>
3484 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
3485  EIGEN_DEBUG_UNALIGNED_STORE
3486  vec_xst(from, 0, to);
3487 }
3488 
3489 template <>
3490 EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3491  pstoreu_partial_common<Packet2d>(to, from, n, offset);
3492 }
3493 
3494 template <>
3495 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
3496  EIGEN_PPC_PREFETCH(addr);
3497 }
3498 
3499 template <>
3500 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
3501  EIGEN_ALIGN16 double x[2];
3502  pstore<double>(x, a);
3503  return x[0];
3504 }
3505 
3506 template <>
3507 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
3508  return vec_sld(a, a, 8);
3509 }
3510 template <>
3511 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
3512  return vec_abs(a);
3513 }
3514 #ifdef __POWER8_VECTOR__
3515 template <>
3516 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3517  return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
3518 }
3519 #else
3520 #ifdef _BIG_ENDIAN
3521 static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3522 #else
3523 static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
3524 #endif
3525 
3526 template <>
3527 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3528  Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3529  return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
3530 }
3531 #endif
3532 
3533 template <>
3534 inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
3535 
3536 template <>
3537 inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
3538 
3539 // Packet2l shifts.
3540 // For POWER8 we simply use vec_sr/l.
3541 //
3542 // Things are more complicated for POWER7. There is actually a
3543 // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
3544 // So we need to shift by N % 32 and rearrage bytes.
3545 #ifdef __POWER8_VECTOR__
3546 
3547 template <int N>
3548 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3549  const Packet2ul shift = {N, N};
3550  return vec_sl(a, shift);
3551 }
3552 
3553 template <int N>
3554 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3555  const Packet2ul shift = {N, N};
3556  return vec_sr(a, shift);
3557 }
3558 
3559 #else
3560 
3561 // Shifts [A, B, C, D] to [B, 0, D, 0].
3562 // Used to implement left shifts for Packet2l.
3563 EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
3564  static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3565  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
3566 #ifdef _BIG_ENDIAN
3567  return vec_perm(p4i_ZERO, a, perm);
3568 #else
3569  return vec_perm(a, p4i_ZERO, perm);
3570 #endif
3571 }
3572 
3573 // Shifts [A, B, C, D] to [0, A, 0, C].
3574 // Used to implement right shifts for Packet2l.
3575 EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
3576  static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3577  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
3578 #ifdef _BIG_ENDIAN
3579  return vec_perm(p4i_ZERO, a, perm);
3580 #else
3581  return vec_perm(a, p4i_ZERO, perm);
3582 #endif
3583 }
3584 
3585 template <int N, typename EnableIf = void>
3586 struct plogical_shift_left_impl;
3587 
3588 template <int N>
3589 struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3590  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3591  static const unsigned n = static_cast<unsigned>(N);
3592  const Packet4ui shift = {n, n, n, n};
3593  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3594  static const unsigned m = static_cast<unsigned>(32 - N);
3595  const Packet4ui shift_right = {m, m, m, m};
3596  const Packet4i out_hi = vec_sl(ai, shift);
3597  const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
3598  return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3599  }
3600 };
3601 
3602 template <int N>
3603 struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
3604  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3605  static const unsigned m = static_cast<unsigned>(N - 32);
3606  const Packet4ui shift = {m, m, m, m};
3607  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3608  return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
3609  }
3610 };
3611 
3612 template <int N>
3613 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3614  return plogical_shift_left_impl<N>::run(a);
3615 }
3616 
3617 template <int N, typename EnableIf = void>
3618 struct plogical_shift_right_impl;
3619 
3620 template <int N>
3621 struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3622  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3623  static const unsigned n = static_cast<unsigned>(N);
3624  const Packet4ui shift = {n, n, n, n};
3625  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3626  static const unsigned m = static_cast<unsigned>(32 - N);
3627  const Packet4ui shift_left = {m, m, m, m};
3628  const Packet4i out_lo = vec_sr(ai, shift);
3629  const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
3630  return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3631  }
3632 };
3633 
3634 template <int N>
3635 struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
3636  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3637  static const unsigned m = static_cast<unsigned>(N - 32);
3638  const Packet4ui shift = {m, m, m, m};
3639  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3640  return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
3641  }
3642 };
3643 
3644 template <int N>
3645 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3646  return plogical_shift_right_impl<N>::run(a);
3647 }
3648 #endif
3649 
3650 template <>
3651 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
3652  // Clamp exponent to [-2099, 2099]
3653  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
3654  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
3655 
3656  // Split 2^e into four factors and multiply:
3657  const Packet2l bias = {1023, 1023};
3658  Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
3659  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
3660  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
3661  b = psub(psub(psub(e, b), b), b); // e - 3b
3662  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
3663  out = pmul(out, c); // a * 2^e
3664  return out;
3665 }
3666 
3667 // Extract exponent without existence of Packet2l.
3668 template <>
3669 EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
3670  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
3671 }
3672 
3673 template <>
3674 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
3675  return pfrexp_generic(a, exponent);
3676 }
3677 
3678 template <>
3679 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
3680  Packet2d b, sum;
3681  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
3682  sum = a + b;
3683  return pfirst<Packet2d>(sum);
3684 }
3685 
3686 // Other reduction functions:
3687 // mul
3688 template <>
3689 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
3690  return pfirst(
3691  pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3692 }
3693 
3694 // min
3695 template <>
3696 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
3697  return pfirst(
3698  pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3699 }
3700 
3701 // max
3702 template <>
3703 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
3704  return pfirst(
3705  pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3706 }
3707 
3708 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
3709  Packet2d t0, t1;
3710  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3711  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
3712  kernel.packet[0] = t0;
3713  kernel.packet[1] = t1;
3714 }
3715 
3716 template <>
3717 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
3718  const Packet2d& elsePacket) {
3719  Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
3720  Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
3721  return vec_sel(elsePacket, thenPacket, mask);
3722 }
3723 
3724 #endif // __VSX__
3725 } // end namespace internal
3726 
3727 } // end namespace Eigen
3728 
3729 #endif // EIGEN_PACKET_MATH_ALTIVEC_H
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
Definition: BFloat16.h:231
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)