$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_GPU_H
11 #define EIGEN_PACKET_MATH_GPU_H
12 
13 // IWYU pragma: private
14 #include "../../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 // Read-only data cached load available.
21 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
22 #define EIGEN_GPU_HAS_LDG 1
23 #endif
24 
25 // FP16 math available.
26 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
27 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
28 #endif
29 
30 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
31 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
32 #endif
33 
34 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
35 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
36 // of the functions, while the latter can only deal with one of them.
37 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
38 #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
39 #else
40 #define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
41 #endif
42 
43 // Make sure this is only available when targeting a GPU: we don't want to
44 // introduce conflicts between these packet_traits definitions and the ones
45 // we'll use on the host side (SSE, AVX, ...)
46 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
47 
48 template <>
49 struct is_arithmetic<float4> {
50  enum { value = true };
51 };
52 template <>
53 struct is_arithmetic<double2> {
54  enum { value = true };
55 };
56 
57 template <>
58 struct packet_traits<float> : default_packet_traits {
59  typedef float4 type;
60  typedef float4 half;
61  enum {
62  Vectorizable = 1,
63  AlignedOnScalar = 1,
64  size = 4,
65 
66  HasDiv = 1,
67  HasSin = 0,
68  HasCos = 0,
69  HasLog = 1,
70  HasExp = 1,
71  HasSqrt = 1,
72  HasRsqrt = 1,
73  HasLGamma = 1,
74  HasDiGamma = 1,
75  HasZeta = 1,
76  HasPolygamma = 1,
77  HasErf = 1,
78  HasErfc = 1,
79  HasNdtri = 1,
80  HasBessel = 1,
81  HasIGamma = 1,
82  HasIGammaDerA = 1,
83  HasGammaSampleDerAlpha = 1,
84  HasIGammac = 1,
85  HasBetaInc = 1,
86 
87  HasBlend = 0,
88  HasFloor = 1,
89  HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
90  };
91 };
92 
93 template <>
94 struct packet_traits<double> : default_packet_traits {
95  typedef double2 type;
96  typedef double2 half;
97  enum {
98  Vectorizable = 1,
99  AlignedOnScalar = 1,
100  size = 2,
101 
102  HasDiv = 1,
103  HasLog = 1,
104  HasExp = 1,
105  HasSqrt = 1,
106  HasRsqrt = 1,
107  HasLGamma = 1,
108  HasDiGamma = 1,
109  HasZeta = 1,
110  HasPolygamma = 1,
111  HasErf = 1,
112  HasErfc = 1,
113  HasNdtri = 1,
114  HasBessel = 1,
115  HasIGamma = 1,
116  HasIGammaDerA = 1,
117  HasGammaSampleDerAlpha = 1,
118  HasIGammac = 1,
119  HasBetaInc = 1,
120  HasBlend = 0,
121  };
122 };
123 
124 template <>
125 struct unpacket_traits<float4> {
126  typedef float type;
127  enum {
128  size = 4,
129  alignment = Aligned16,
130  vectorizable = true,
131  masked_load_available = false,
132  masked_store_available = false
133  };
134  typedef float4 half;
135 };
136 template <>
137 struct unpacket_traits<double2> {
138  typedef double type;
139  enum {
140  size = 2,
141  alignment = Aligned16,
142  vectorizable = true,
143  masked_load_available = false,
144  masked_store_available = false
145  };
146  typedef double2 half;
147 };
148 
149 template <>
150 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
151  return make_float4(from, from, from, from);
152 }
153 template <>
154 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
155  return make_double2(from, from);
156 }
157 
158 #if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
159 
160 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
161  return __int_as_float(__float_as_int(a) & __float_as_int(b));
162 }
163 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
164  return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
165 }
166 
167 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
168  return __int_as_float(__float_as_int(a) | __float_as_int(b));
169 }
170 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
171  return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
172 }
173 
174 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
175  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
176 }
177 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
178  return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
179 }
180 
181 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
182  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
183 }
184 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
185  return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
186 }
187 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
188  return __int_as_float(a == b ? 0xffffffffu : 0u);
189 }
190 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
191  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
192 }
193 
194 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
195  return __int_as_float(a < b ? 0xffffffffu : 0u);
196 }
197 
198 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
199  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
200 }
201 
202 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
203  return __int_as_float(a <= b ? 0xffffffffu : 0u);
204 }
205 
206 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
207  return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
208 }
209 
210 template <>
211 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
212  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
213 }
214 template <>
215 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
216  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
217 }
218 
219 template <>
220 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
221  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
222 }
223 template <>
224 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
225  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
226 }
227 
228 template <>
229 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
230  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
231 }
232 template <>
233 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
234  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
235 }
236 
237 template <>
238 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
239  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
240  bitwise_andnot(a.w, b.w));
241 }
242 template <>
243 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
244  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
245 }
246 
247 template <>
248 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
249  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
250 }
251 template <>
252 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
253  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
254 }
255 template <>
256 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
257  return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
258 }
259 template <>
260 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
261  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
262 }
263 template <>
264 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
265  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
266 }
267 template <>
268 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
269  return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
270 }
271 #endif // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
272 
273 template <>
274 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
275  return make_float4(a, a + 1, a + 2, a + 3);
276 }
277 template <>
278 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
279  return make_double2(a, a + 1);
280 }
281 
282 template <>
283 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
284  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
285 }
286 template <>
287 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
288  return make_double2(a.x + b.x, a.y + b.y);
289 }
290 
291 template <>
292 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
293  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
294 }
295 template <>
296 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
297  return make_double2(a.x - b.x, a.y - b.y);
298 }
299 
300 template <>
301 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
302  return make_float4(-a.x, -a.y, -a.z, -a.w);
303 }
304 template <>
305 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
306  return make_double2(-a.x, -a.y);
307 }
308 
309 template <>
310 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
311  return a;
312 }
313 template <>
314 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
315  return a;
316 }
317 
318 template <>
319 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
320  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
321 }
322 template <>
323 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
324  return make_double2(a.x * b.x, a.y * b.y);
325 }
326 
327 template <>
328 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
329  return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
330 }
331 template <>
332 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
333  return make_double2(a.x / b.x, a.y / b.y);
334 }
335 
336 template <>
337 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
338  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
339 }
340 template <>
341 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
342  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
343 }
344 
345 template <>
346 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
347  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
348 }
349 template <>
350 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
351  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
352 }
353 
354 template <>
355 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
356  return *reinterpret_cast<const float4*>(from);
357 }
358 
359 template <>
360 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
361  return *reinterpret_cast<const double2*>(from);
362 }
363 
364 template <>
365 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
366  return make_float4(from[0], from[1], from[2], from[3]);
367 }
368 template <>
369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
370  return make_double2(from[0], from[1]);
371 }
372 
373 template <>
374 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
375  return make_float4(from[0], from[0], from[1], from[1]);
376 }
377 template <>
378 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
379  return make_double2(from[0], from[0]);
380 }
381 
382 template <>
383 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
384  *reinterpret_cast<float4*>(to) = from;
385 }
386 
387 template <>
388 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
389  *reinterpret_cast<double2*>(to) = from;
390 }
391 
392 template <>
393 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
394  to[0] = from.x;
395  to[1] = from.y;
396  to[2] = from.z;
397  to[3] = from.w;
398 }
399 
400 template <>
401 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
402  to[0] = from.x;
403  to[1] = from.y;
404 }
405 
406 template <>
407 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
408 #if defined(EIGEN_GPU_HAS_LDG)
409  return __ldg(reinterpret_cast<const float4*>(from));
410 #else
411  return make_float4(from[0], from[1], from[2], from[3]);
412 #endif
413 }
414 template <>
415 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
416 #if defined(EIGEN_GPU_HAS_LDG)
417  return __ldg(reinterpret_cast<const double2*>(from));
418 #else
419  return make_double2(from[0], from[1]);
420 #endif
421 }
422 
423 template <>
424 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
425 #if defined(EIGEN_GPU_HAS_LDG)
426  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
427 #else
428  return make_float4(from[0], from[1], from[2], from[3]);
429 #endif
430 }
431 template <>
432 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
433 #if defined(EIGEN_GPU_HAS_LDG)
434  return make_double2(__ldg(from + 0), __ldg(from + 1));
435 #else
436  return make_double2(from[0], from[1]);
437 #endif
438 }
439 
440 template <>
441 EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
442  return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
443 }
444 
445 template <>
446 EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
447  return make_double2(from[0 * stride], from[1 * stride]);
448 }
449 
450 template <>
451 EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
452  to[stride * 0] = from.x;
453  to[stride * 1] = from.y;
454  to[stride * 2] = from.z;
455  to[stride * 3] = from.w;
456 }
457 template <>
458 EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
459  to[stride * 0] = from.x;
460  to[stride * 1] = from.y;
461 }
462 
463 template <>
464 EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
465  return a.x;
466 }
467 template <>
468 EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
469  return a.x;
470 }
471 
472 template <>
473 EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
474  return a.x + a.y + a.z + a.w;
475 }
476 template <>
477 EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
478  return a.x + a.y;
479 }
480 
481 template <>
482 EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
483  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
484 }
485 template <>
486 EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
487  return fmax(a.x, a.y);
488 }
489 
490 template <>
491 EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
492  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
493 }
494 template <>
495 EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
496  return fmin(a.x, a.y);
497 }
498 
499 template <>
500 EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
501  return a.x * a.y * a.z * a.w;
502 }
503 template <>
504 EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
505  return a.x * a.y;
506 }
507 
508 template <>
509 EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
510  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
511 }
512 template <>
513 EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
514  return make_double2(fabs(a.x), fabs(a.y));
515 }
516 
517 template <>
518 EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
519  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
520 }
521 template <>
522 EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
523  return make_double2(floor(a.x), floor(a.y));
524 }
525 
526 template <>
527 EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
528  return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
529 }
530 template <>
531 EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
532  return make_double2(ceil(a.x), ceil(a.y));
533 }
534 
535 template <>
536 EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
537  return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
538 }
539 template <>
540 EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
541  return make_double2(rint(a.x), rint(a.y));
542 }
543 
544 template <>
545 EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
546  return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
547 }
548 template <>
549 EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
550  return make_double2(trunc(a.x), trunc(a.y));
551 }
552 
553 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
554  float tmp = kernel.packet[0].y;
555  kernel.packet[0].y = kernel.packet[1].x;
556  kernel.packet[1].x = tmp;
557 
558  tmp = kernel.packet[0].z;
559  kernel.packet[0].z = kernel.packet[2].x;
560  kernel.packet[2].x = tmp;
561 
562  tmp = kernel.packet[0].w;
563  kernel.packet[0].w = kernel.packet[3].x;
564  kernel.packet[3].x = tmp;
565 
566  tmp = kernel.packet[1].z;
567  kernel.packet[1].z = kernel.packet[2].y;
568  kernel.packet[2].y = tmp;
569 
570  tmp = kernel.packet[1].w;
571  kernel.packet[1].w = kernel.packet[3].y;
572  kernel.packet[3].y = tmp;
573 
574  tmp = kernel.packet[2].w;
575  kernel.packet[2].w = kernel.packet[3].z;
576  kernel.packet[3].z = tmp;
577 }
578 
579 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
580  double tmp = kernel.packet[0].y;
581  kernel.packet[0].y = kernel.packet[1].x;
582  kernel.packet[1].x = tmp;
583 }
584 
585 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
586 
587 // Half-packet functions are not available on the host for CUDA 9.0-9.2, only
588 // on device. There is no benefit to using them on the host anyways, since they are
589 // emulated.
590 #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
591 
592 typedef ulonglong2 Packet4h2;
593 template <>
594 struct unpacket_traits<Packet4h2> {
595  typedef Eigen::half type;
596  enum {
597  size = 8,
598  alignment = Aligned16,
599  vectorizable = true,
600  masked_load_available = false,
601  masked_store_available = false
602  };
603  typedef Packet4h2 half;
604 };
605 template <>
606 struct is_arithmetic<Packet4h2> {
607  enum { value = true };
608 };
609 
610 template <>
611 struct unpacket_traits<half2> {
612  typedef Eigen::half type;
613  enum {
614  size = 2,
615  alignment = Aligned16,
616  vectorizable = true,
617  masked_load_available = false,
618  masked_store_available = false
619  };
620  typedef half2 half;
621 };
622 template <>
623 struct is_arithmetic<half2> {
624  enum { value = true };
625 };
626 
627 template <>
628 struct packet_traits<Eigen::half> : default_packet_traits {
629  typedef Packet4h2 type;
630  typedef Packet4h2 half;
631  enum {
632  Vectorizable = 1,
633  AlignedOnScalar = 1,
634  size = 8,
635  HasAdd = 1,
636  HasSub = 1,
637  HasMul = 1,
638  HasDiv = 1,
639  HasSqrt = 1,
640  HasRsqrt = 1,
641  HasExp = 1,
642  HasExpm1 = 1,
643  HasLog = 1,
644  HasLog1p = 1
645  };
646 };
647 
648 template <>
649 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
650  return __half2half2(from);
651 }
652 
653 template <>
654 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
655  Packet4h2 r;
656  half2* p_alias = reinterpret_cast<half2*>(&r);
657  p_alias[0] = pset1<half2>(from);
658  p_alias[1] = pset1<half2>(from);
659  p_alias[2] = pset1<half2>(from);
660  p_alias[3] = pset1<half2>(from);
661  return r;
662 }
663 
664 namespace {
665 
666 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
667  return *reinterpret_cast<const half2*>(from);
668 }
669 
670 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
671 
672 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
673  return __halves2half2(from[0], from[0]);
674 }
675 
676 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
677  *reinterpret_cast<half2*>(to) = from;
678 }
679 
680 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
681  to[0] = __low2half(from);
682  to[1] = __high2half(from);
683 }
684 
685 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
686 #if defined(EIGEN_GPU_HAS_LDG)
687  // Input is guaranteed to be properly aligned.
688  return __ldg(reinterpret_cast<const half2*>(from));
689 #else
690  return __halves2half2(*(from + 0), *(from + 1));
691 #endif
692 }
693 
694 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
695 #if defined(EIGEN_GPU_HAS_LDG)
696  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
697 #else
698  return __halves2half2(*(from + 0), *(from + 1));
699 #endif
700 }
701 
702 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
703  return __halves2half2(from[0 * stride], from[1 * stride]);
704 }
705 
706 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
707  to[stride * 0] = __low2half(from);
708  to[stride * 1] = __high2half(from);
709 }
710 
711 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
712 
713 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
714  half a1 = __low2half(a);
715  half a2 = __high2half(a);
716  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
717  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
718  return __halves2half2(result1, result2);
719 }
720 
721 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
722  half true_half = half_impl::raw_uint16_to_half(0xffffu);
723  return pset1<half2>(true_half);
724 }
725 
726 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
727  half false_half = half_impl::raw_uint16_to_half(0x0000u);
728  return pset1<half2>(false_half);
729 }
730 
731 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
732  __half a1 = __low2half(kernel.packet[0]);
733  __half a2 = __high2half(kernel.packet[0]);
734  __half b1 = __low2half(kernel.packet[1]);
735  __half b2 = __high2half(kernel.packet[1]);
736  kernel.packet[0] = __halves2half2(a1, b1);
737  kernel.packet[1] = __halves2half2(a2, b2);
738 }
739 
740 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
741 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
742  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
743 #else
744  float f = __half2float(a) + 1.0f;
745  return __halves2half2(a, __float2half(f));
746 #endif
747 }
748 
749 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
750  half mask_low = __low2half(mask);
751  half mask_high = __high2half(mask);
752  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
753  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
754  return __halves2half2(result_low, result_high);
755 }
756 
757 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
758  half true_half = half_impl::raw_uint16_to_half(0xffffu);
759  half false_half = half_impl::raw_uint16_to_half(0x0000u);
760  half a1 = __low2half(a);
761  half a2 = __high2half(a);
762  half b1 = __low2half(b);
763  half b2 = __high2half(b);
764  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
765  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
766  return __halves2half2(eq1, eq2);
767 }
768 
769 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
770  half true_half = half_impl::raw_uint16_to_half(0xffffu);
771  half false_half = half_impl::raw_uint16_to_half(0x0000u);
772  half a1 = __low2half(a);
773  half a2 = __high2half(a);
774  half b1 = __low2half(b);
775  half b2 = __high2half(b);
776  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
777  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
778  return __halves2half2(eq1, eq2);
779 }
780 
781 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
782  half true_half = half_impl::raw_uint16_to_half(0xffffu);
783  half false_half = half_impl::raw_uint16_to_half(0x0000u);
784  half a1 = __low2half(a);
785  half a2 = __high2half(a);
786  half b1 = __low2half(b);
787  half b2 = __high2half(b);
788  half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
789  half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
790  return __halves2half2(eq1, eq2);
791 }
792 
793 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
794  half a1 = __low2half(a);
795  half a2 = __high2half(a);
796  half b1 = __low2half(b);
797  half b2 = __high2half(b);
798  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
799  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
800  return __halves2half2(result1, result2);
801 }
802 
803 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
804  half a1 = __low2half(a);
805  half a2 = __high2half(a);
806  half b1 = __low2half(b);
807  half b2 = __high2half(b);
808  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
809  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
810  return __halves2half2(result1, result2);
811 }
812 
813 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
814  half a1 = __low2half(a);
815  half a2 = __high2half(a);
816  half b1 = __low2half(b);
817  half b2 = __high2half(b);
818  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
819  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
820  return __halves2half2(result1, result2);
821 }
822 
823 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
824  half a1 = __low2half(a);
825  half a2 = __high2half(a);
826  half b1 = __low2half(b);
827  half b2 = __high2half(b);
828  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
829  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
830  return __halves2half2(result1, result2);
831 }
832 
833 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
834 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
835  return __hadd2(a, b);
836 #else
837  float a1 = __low2float(a);
838  float a2 = __high2float(a);
839  float b1 = __low2float(b);
840  float b2 = __high2float(b);
841  float r1 = a1 + b1;
842  float r2 = a2 + b2;
843  return __floats2half2_rn(r1, r2);
844 #endif
845 }
846 
847 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
848 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
849  return __hsub2(a, b);
850 #else
851  float a1 = __low2float(a);
852  float a2 = __high2float(a);
853  float b1 = __low2float(b);
854  float b2 = __high2float(b);
855  float r1 = a1 - b1;
856  float r2 = a2 - b2;
857  return __floats2half2_rn(r1, r2);
858 #endif
859 }
860 
861 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
862 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
863  return __hneg2(a);
864 #else
865  float a1 = __low2float(a);
866  float a2 = __high2float(a);
867  return __floats2half2_rn(-a1, -a2);
868 #endif
869 }
870 
871 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
872 
873 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
874 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
875  return __hmul2(a, b);
876 #else
877  float a1 = __low2float(a);
878  float a2 = __high2float(a);
879  float b1 = __low2float(b);
880  float b2 = __high2float(b);
881  float r1 = a1 * b1;
882  float r2 = a2 * b2;
883  return __floats2half2_rn(r1, r2);
884 #endif
885 }
886 
887 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
888 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
889  return __hfma2(a, b, c);
890 #else
891  float a1 = __low2float(a);
892  float a2 = __high2float(a);
893  float b1 = __low2float(b);
894  float b2 = __high2float(b);
895  float c1 = __low2float(c);
896  float c2 = __high2float(c);
897  float r1 = a1 * b1 + c1;
898  float r2 = a2 * b2 + c2;
899  return __floats2half2_rn(r1, r2);
900 #endif
901 }
902 
903 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
904 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
905  return __h2div(a, b);
906 #else
907  float a1 = __low2float(a);
908  float a2 = __high2float(a);
909  float b1 = __low2float(b);
910  float b2 = __high2float(b);
911  float r1 = a1 / b1;
912  float r2 = a2 / b2;
913  return __floats2half2_rn(r1, r2);
914 #endif
915 }
916 
917 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
918  float a1 = __low2float(a);
919  float a2 = __high2float(a);
920  float b1 = __low2float(b);
921  float b2 = __high2float(b);
922  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
923  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
924  return __halves2half2(r1, r2);
925 }
926 
927 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
928  float a1 = __low2float(a);
929  float a2 = __high2float(a);
930  float b1 = __low2float(b);
931  float b2 = __high2float(b);
932  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
933  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
934  return __halves2half2(r1, r2);
935 }
936 
937 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
938 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
939  return __hadd(__low2half(a), __high2half(a));
940 #else
941  float a1 = __low2float(a);
942  float a2 = __high2float(a);
943  return Eigen::half(__float2half(a1 + a2));
944 #endif
945 }
946 
947 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
948 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
949  __half first = __low2half(a);
950  __half second = __high2half(a);
951  return __hgt(first, second) ? first : second;
952 #else
953  float a1 = __low2float(a);
954  float a2 = __high2float(a);
955  return a1 > a2 ? __low2half(a) : __high2half(a);
956 #endif
957 }
958 
959 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
960 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
961  __half first = __low2half(a);
962  __half second = __high2half(a);
963  return __hlt(first, second) ? first : second;
964 #else
965  float a1 = __low2float(a);
966  float a2 = __high2float(a);
967  return a1 < a2 ? __low2half(a) : __high2half(a);
968 #endif
969 }
970 
971 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
972 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
973  return __hmul(__low2half(a), __high2half(a));
974 #else
975  float a1 = __low2float(a);
976  float a2 = __high2float(a);
977  return Eigen::half(__float2half(a1 * a2));
978 #endif
979 }
980 
981 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
982  float a1 = __low2float(a);
983  float a2 = __high2float(a);
984  float r1 = log1pf(a1);
985  float r2 = log1pf(a2);
986  return __floats2half2_rn(r1, r2);
987 }
988 
989 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
990  float a1 = __low2float(a);
991  float a2 = __high2float(a);
992  float r1 = expm1f(a1);
993  float r2 = expm1f(a2);
994  return __floats2half2_rn(r1, r2);
995 }
996 
997 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
998 
999 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
1000 
1001 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
1002 
1003 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
1004 
1005 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
1006 
1007 #else
1008 
1009 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
1010  float a1 = __low2float(a);
1011  float a2 = __high2float(a);
1012  float r1 = logf(a1);
1013  float r2 = logf(a2);
1014  return __floats2half2_rn(r1, r2);
1015 }
1016 
1017 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
1018  float a1 = __low2float(a);
1019  float a2 = __high2float(a);
1020  float r1 = expf(a1);
1021  float r2 = expf(a2);
1022  return __floats2half2_rn(r1, r2);
1023 }
1024 
1025 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
1026  float a1 = __low2float(a);
1027  float a2 = __high2float(a);
1028  float r1 = sqrtf(a1);
1029  float r2 = sqrtf(a2);
1030  return __floats2half2_rn(r1, r2);
1031 }
1032 
1033 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
1034  float a1 = __low2float(a);
1035  float a2 = __high2float(a);
1036  float r1 = rsqrtf(a1);
1037  float r2 = rsqrtf(a2);
1038  return __floats2half2_rn(r1, r2);
1039 }
1040 #endif
1041 } // namespace
1042 
1043 template <>
1044 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
1045  return *reinterpret_cast<const Packet4h2*>(from);
1046 }
1047 
1048 // unaligned load;
1049 template <>
1050 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
1051  Packet4h2 r;
1052  half2* p_alias = reinterpret_cast<half2*>(&r);
1053  p_alias[0] = ploadu(from + 0);
1054  p_alias[1] = ploadu(from + 2);
1055  p_alias[2] = ploadu(from + 4);
1056  p_alias[3] = ploadu(from + 6);
1057  return r;
1058 }
1059 
1060 template <>
1061 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
1062  Packet4h2 r;
1063  half2* p_alias = reinterpret_cast<half2*>(&r);
1064  p_alias[0] = ploaddup(from + 0);
1065  p_alias[1] = ploaddup(from + 1);
1066  p_alias[2] = ploaddup(from + 2);
1067  p_alias[3] = ploaddup(from + 3);
1068  return r;
1069 }
1070 
1071 template <>
1072 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1073  *reinterpret_cast<Packet4h2*>(to) = from;
1074 }
1075 
1076 template <>
1077 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
1078  const half2* from_alias = reinterpret_cast<const half2*>(&from);
1079  pstoreu(to + 0, from_alias[0]);
1080  pstoreu(to + 2, from_alias[1]);
1081  pstoreu(to + 4, from_alias[2]);
1082  pstoreu(to + 6, from_alias[3]);
1083 }
1084 
1085 template <>
1086 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1087 #if defined(EIGEN_GPU_HAS_LDG)
1088  Packet4h2 r;
1089  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
1090  return r;
1091 #else
1092  Packet4h2 r;
1093  half2* r_alias = reinterpret_cast<half2*>(&r);
1094  r_alias[0] = ploadt_ro_aligned(from + 0);
1095  r_alias[1] = ploadt_ro_aligned(from + 2);
1096  r_alias[2] = ploadt_ro_aligned(from + 4);
1097  r_alias[3] = ploadt_ro_aligned(from + 6);
1098  return r;
1099 #endif
1100 }
1101 
1102 template <>
1103 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1104  Packet4h2 r;
1105  half2* r_alias = reinterpret_cast<half2*>(&r);
1106  r_alias[0] = ploadt_ro_unaligned(from + 0);
1107  r_alias[1] = ploadt_ro_unaligned(from + 2);
1108  r_alias[2] = ploadt_ro_unaligned(from + 4);
1109  r_alias[3] = ploadt_ro_unaligned(from + 6);
1110  return r;
1111 }
1112 
1113 template <>
1114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1115  Packet4h2 r;
1116  half2* p_alias = reinterpret_cast<half2*>(&r);
1117  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1118  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1119  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1120  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1121  return r;
1122 }
1123 
1124 template <>
1125 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
1126  Index stride) {
1127  const half2* from_alias = reinterpret_cast<const half2*>(&from);
1128  pscatter(to + stride * 0, from_alias[0], stride);
1129  pscatter(to + stride * 2, from_alias[1], stride);
1130  pscatter(to + stride * 4, from_alias[2], stride);
1131  pscatter(to + stride * 6, from_alias[3], stride);
1132 }
1133 
1134 template <>
1135 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
1136  return pfirst(*(reinterpret_cast<const half2*>(&a)));
1137 }
1138 
1139 template <>
1140 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
1141  Packet4h2 r;
1142  half2* p_alias = reinterpret_cast<half2*>(&r);
1143  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1144  p_alias[0] = pabs(a_alias[0]);
1145  p_alias[1] = pabs(a_alias[1]);
1146  p_alias[2] = pabs(a_alias[2]);
1147  p_alias[3] = pabs(a_alias[3]);
1148  return r;
1149 }
1150 
1151 template <>
1152 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
1153  half true_half = half_impl::raw_uint16_to_half(0xffffu);
1154  return pset1<Packet4h2>(true_half);
1155 }
1156 
1157 template <>
1158 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
1159  half false_half = half_impl::raw_uint16_to_half(0x0000u);
1160  return pset1<Packet4h2>(false_half);
1161 }
1162 
1163 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
1164  double* d_row3, double* d_row4, double* d_row5,
1165  double* d_row6, double* d_row7) {
1166  double d_tmp;
1167  d_tmp = d_row0[1];
1168  d_row0[1] = d_row4[0];
1169  d_row4[0] = d_tmp;
1170 
1171  d_tmp = d_row1[1];
1172  d_row1[1] = d_row5[0];
1173  d_row5[0] = d_tmp;
1174 
1175  d_tmp = d_row2[1];
1176  d_row2[1] = d_row6[0];
1177  d_row6[0] = d_tmp;
1178 
1179  d_tmp = d_row3[1];
1180  d_row3[1] = d_row7[0];
1181  d_row7[0] = d_tmp;
1182 }
1183 
1184 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
1185  half2* f_row3) {
1186  half2 f_tmp;
1187  f_tmp = f_row0[1];
1188  f_row0[1] = f_row2[0];
1189  f_row2[0] = f_tmp;
1190 
1191  f_tmp = f_row1[1];
1192  f_row1[1] = f_row3[0];
1193  f_row3[0] = f_tmp;
1194 }
1195 
1196 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
1197  __half a1 = __low2half(f0);
1198  __half a2 = __high2half(f0);
1199  __half b1 = __low2half(f1);
1200  __half b2 = __high2half(f1);
1201  f0 = __halves2half2(a1, b1);
1202  f1 = __halves2half2(a2, b2);
1203 }
1204 
1205 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
1206  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
1207  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
1208  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
1209  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
1210  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
1211  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
1212  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
1213  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
1214  ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
1215 
1216  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
1217  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
1218  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
1219  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
1220  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1221  ptranspose_half(f_row0[0], f_row1[0]);
1222  ptranspose_half(f_row0[1], f_row1[1]);
1223  ptranspose_half(f_row2[0], f_row3[0]);
1224  ptranspose_half(f_row2[1], f_row3[1]);
1225 
1226  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
1227  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
1228  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
1229  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
1230  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1231  ptranspose_half(f_row0[0], f_row1[0]);
1232  ptranspose_half(f_row0[1], f_row1[1]);
1233  ptranspose_half(f_row2[0], f_row3[0]);
1234  ptranspose_half(f_row2[1], f_row3[1]);
1235 
1236  f_row0 = reinterpret_cast<half2*>(d_row4);
1237  f_row1 = reinterpret_cast<half2*>(d_row5);
1238  f_row2 = reinterpret_cast<half2*>(d_row6);
1239  f_row3 = reinterpret_cast<half2*>(d_row7);
1240  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1241  ptranspose_half(f_row0[0], f_row1[0]);
1242  ptranspose_half(f_row0[1], f_row1[1]);
1243  ptranspose_half(f_row2[0], f_row3[0]);
1244  ptranspose_half(f_row2[1], f_row3[1]);
1245 
1246  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
1247  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
1248  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
1249  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
1250  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1251  ptranspose_half(f_row0[0], f_row1[0]);
1252  ptranspose_half(f_row0[1], f_row1[1]);
1253  ptranspose_half(f_row2[0], f_row3[0]);
1254  ptranspose_half(f_row2[1], f_row3[1]);
1255 }
1256 
1257 template <>
1258 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
1259 #if defined(EIGEN_HIP_DEVICE_COMPILE)
1260 
1261  Packet4h2 r;
1262  half2* p_alias = reinterpret_cast<half2*>(&r);
1263  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1264  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
1265  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
1266  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
1267  return r;
1268 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1269  Packet4h2 r;
1270  half2* r_alias = reinterpret_cast<half2*>(&r);
1271 
1272  half2 b = pset1<half2>(a);
1273  half2 c;
1274  half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
1275  half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
1276 
1277  c = __hadd2(b, half_offset0);
1278  r_alias[0] = plset(__low2half(c));
1279  r_alias[1] = plset(__high2half(c));
1280 
1281  c = __hadd2(b, half_offset1);
1282  r_alias[2] = plset(__low2half(c));
1283  r_alias[3] = plset(__high2half(c));
1284 
1285  return r;
1286 
1287 #else
1288  float f = __half2float(a);
1289  Packet4h2 r;
1290  half2* p_alias = reinterpret_cast<half2*>(&r);
1291  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
1292  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
1293  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
1294  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
1295  return r;
1296 #endif
1297 }
1298 
1299 template <>
1300 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1301  const Packet4h2& b) {
1302  Packet4h2 r;
1303  half2* r_alias = reinterpret_cast<half2*>(&r);
1304  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
1305  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1306  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1307  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1308  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1309  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1310  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1311  return r;
1312 }
1313 
1314 template <>
1315 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1316  Packet4h2 r;
1317  half2* r_alias = reinterpret_cast<half2*>(&r);
1318  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1319  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1320  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1321  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1322  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1323  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1324  return r;
1325 }
1326 
1327 template <>
1328 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1329  Packet4h2 r;
1330  half2* r_alias = reinterpret_cast<half2*>(&r);
1331  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1332  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1333  r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
1334  r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
1335  r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
1336  r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
1337  return r;
1338 }
1339 
1340 template <>
1341 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1342  Packet4h2 r;
1343  half2* r_alias = reinterpret_cast<half2*>(&r);
1344  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1345  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1346  r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
1347  r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
1348  r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
1349  r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
1350  return r;
1351 }
1352 
1353 template <>
1354 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1355  Packet4h2 r;
1356  half2* r_alias = reinterpret_cast<half2*>(&r);
1357  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1358  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1359  r_alias[0] = pand(a_alias[0], b_alias[0]);
1360  r_alias[1] = pand(a_alias[1], b_alias[1]);
1361  r_alias[2] = pand(a_alias[2], b_alias[2]);
1362  r_alias[3] = pand(a_alias[3], b_alias[3]);
1363  return r;
1364 }
1365 
1366 template <>
1367 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1368  Packet4h2 r;
1369  half2* r_alias = reinterpret_cast<half2*>(&r);
1370  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1371  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1372  r_alias[0] = por(a_alias[0], b_alias[0]);
1373  r_alias[1] = por(a_alias[1], b_alias[1]);
1374  r_alias[2] = por(a_alias[2], b_alias[2]);
1375  r_alias[3] = por(a_alias[3], b_alias[3]);
1376  return r;
1377 }
1378 
1379 template <>
1380 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1381  Packet4h2 r;
1382  half2* r_alias = reinterpret_cast<half2*>(&r);
1383  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1384  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1385  r_alias[0] = pxor(a_alias[0], b_alias[0]);
1386  r_alias[1] = pxor(a_alias[1], b_alias[1]);
1387  r_alias[2] = pxor(a_alias[2], b_alias[2]);
1388  r_alias[3] = pxor(a_alias[3], b_alias[3]);
1389  return r;
1390 }
1391 
1392 template <>
1393 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1394  Packet4h2 r;
1395  half2* r_alias = reinterpret_cast<half2*>(&r);
1396  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1397  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1398  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1399  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1400  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1401  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1402  return r;
1403 }
1404 
1405 template <>
1406 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1407  Packet4h2 r;
1408  half2* r_alias = reinterpret_cast<half2*>(&r);
1409  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1410  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1411  r_alias[0] = padd(a_alias[0], b_alias[0]);
1412  r_alias[1] = padd(a_alias[1], b_alias[1]);
1413  r_alias[2] = padd(a_alias[2], b_alias[2]);
1414  r_alias[3] = padd(a_alias[3], b_alias[3]);
1415  return r;
1416 }
1417 
1418 template <>
1419 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1420  Packet4h2 r;
1421  half2* r_alias = reinterpret_cast<half2*>(&r);
1422  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1423  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1424  r_alias[0] = psub(a_alias[0], b_alias[0]);
1425  r_alias[1] = psub(a_alias[1], b_alias[1]);
1426  r_alias[2] = psub(a_alias[2], b_alias[2]);
1427  r_alias[3] = psub(a_alias[3], b_alias[3]);
1428  return r;
1429 }
1430 
1431 template <>
1432 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
1433  Packet4h2 r;
1434  half2* r_alias = reinterpret_cast<half2*>(&r);
1435  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1436  r_alias[0] = pnegate(a_alias[0]);
1437  r_alias[1] = pnegate(a_alias[1]);
1438  r_alias[2] = pnegate(a_alias[2]);
1439  r_alias[3] = pnegate(a_alias[3]);
1440  return r;
1441 }
1442 
1443 template <>
1444 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
1445  return a;
1446 }
1447 
1448 template <>
1449 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1450  Packet4h2 r;
1451  half2* r_alias = reinterpret_cast<half2*>(&r);
1452  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1453  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1454  r_alias[0] = pmul(a_alias[0], b_alias[0]);
1455  r_alias[1] = pmul(a_alias[1], b_alias[1]);
1456  r_alias[2] = pmul(a_alias[2], b_alias[2]);
1457  r_alias[3] = pmul(a_alias[3], b_alias[3]);
1458  return r;
1459 }
1460 
1461 template <>
1462 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
1463  const Packet4h2& c) {
1464  Packet4h2 r;
1465  half2* r_alias = reinterpret_cast<half2*>(&r);
1466  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1467  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1468  const half2* c_alias = reinterpret_cast<const half2*>(&c);
1469  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1470  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1471  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1472  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1473  return r;
1474 }
1475 
1476 template <>
1477 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1478  Packet4h2 r;
1479  half2* r_alias = reinterpret_cast<half2*>(&r);
1480  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1481  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1482  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1483  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1484  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1485  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1486  return r;
1487 }
1488 
1489 template <>
1490 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1491  Packet4h2 r;
1492  half2* r_alias = reinterpret_cast<half2*>(&r);
1493  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1494  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1495  r_alias[0] = pmin(a_alias[0], b_alias[0]);
1496  r_alias[1] = pmin(a_alias[1], b_alias[1]);
1497  r_alias[2] = pmin(a_alias[2], b_alias[2]);
1498  r_alias[3] = pmin(a_alias[3], b_alias[3]);
1499  return r;
1500 }
1501 
1502 template <>
1503 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1504  Packet4h2 r;
1505  half2* r_alias = reinterpret_cast<half2*>(&r);
1506  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1507  const half2* b_alias = reinterpret_cast<const half2*>(&b);
1508  r_alias[0] = pmax(a_alias[0], b_alias[0]);
1509  r_alias[1] = pmax(a_alias[1], b_alias[1]);
1510  r_alias[2] = pmax(a_alias[2], b_alias[2]);
1511  r_alias[3] = pmax(a_alias[3], b_alias[3]);
1512  return r;
1513 }
1514 
1515 template <>
1516 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
1517  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1518 
1519  return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
1520 }
1521 
1522 template <>
1523 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
1524  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1525  half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
1526  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
1527  __half first = predux_max(m0);
1528  __half second = predux_max(m1);
1529 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1530  return (__hgt(first, second) ? first : second);
1531 #else
1532  float ffirst = __half2float(first);
1533  float fsecond = __half2float(second);
1534  return (ffirst > fsecond) ? first : second;
1535 #endif
1536 }
1537 
1538 template <>
1539 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
1540  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1541  half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
1542  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
1543  __half first = predux_min(m0);
1544  __half second = predux_min(m1);
1545 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1546  return (__hlt(first, second) ? first : second);
1547 #else
1548  float ffirst = __half2float(first);
1549  float fsecond = __half2float(second);
1550  return (ffirst < fsecond) ? first : second;
1551 #endif
1552 }
1553 
1554 // likely overflow/underflow
1555 template <>
1556 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
1557  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1558  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
1559 }
1560 
1561 template <>
1562 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
1563  Packet4h2 r;
1564  half2* r_alias = reinterpret_cast<half2*>(&r);
1565  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1566  r_alias[0] = plog1p(a_alias[0]);
1567  r_alias[1] = plog1p(a_alias[1]);
1568  r_alias[2] = plog1p(a_alias[2]);
1569  r_alias[3] = plog1p(a_alias[3]);
1570  return r;
1571 }
1572 
1573 template <>
1574 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
1575  Packet4h2 r;
1576  half2* r_alias = reinterpret_cast<half2*>(&r);
1577  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1578  r_alias[0] = pexpm1(a_alias[0]);
1579  r_alias[1] = pexpm1(a_alias[1]);
1580  r_alias[2] = pexpm1(a_alias[2]);
1581  r_alias[3] = pexpm1(a_alias[3]);
1582  return r;
1583 }
1584 
1585 template <>
1586 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
1587  Packet4h2 r;
1588  half2* r_alias = reinterpret_cast<half2*>(&r);
1589  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1590  r_alias[0] = plog(a_alias[0]);
1591  r_alias[1] = plog(a_alias[1]);
1592  r_alias[2] = plog(a_alias[2]);
1593  r_alias[3] = plog(a_alias[3]);
1594  return r;
1595 }
1596 
1597 template <>
1598 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
1599  Packet4h2 r;
1600  half2* r_alias = reinterpret_cast<half2*>(&r);
1601  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1602  r_alias[0] = pexp(a_alias[0]);
1603  r_alias[1] = pexp(a_alias[1]);
1604  r_alias[2] = pexp(a_alias[2]);
1605  r_alias[3] = pexp(a_alias[3]);
1606  return r;
1607 }
1608 
1609 template <>
1610 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
1611  Packet4h2 r;
1612  half2* r_alias = reinterpret_cast<half2*>(&r);
1613  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1614  r_alias[0] = psqrt(a_alias[0]);
1615  r_alias[1] = psqrt(a_alias[1]);
1616  r_alias[2] = psqrt(a_alias[2]);
1617  r_alias[3] = psqrt(a_alias[3]);
1618  return r;
1619 }
1620 
1621 template <>
1622 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
1623  Packet4h2 r;
1624  half2* r_alias = reinterpret_cast<half2*>(&r);
1625  const half2* a_alias = reinterpret_cast<const half2*>(&a);
1626  r_alias[0] = prsqrt(a_alias[0]);
1627  r_alias[1] = prsqrt(a_alias[1]);
1628  r_alias[2] = prsqrt(a_alias[2]);
1629  r_alias[3] = prsqrt(a_alias[3]);
1630  return r;
1631 }
1632 
1633 // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
1634 // the implementation of GPU half reduction.
1635 template <>
1636 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
1637 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1638  return __hadd2(a, b);
1639 #else
1640  float a1 = __low2float(a);
1641  float a2 = __high2float(a);
1642  float b1 = __low2float(b);
1643  float b2 = __high2float(b);
1644  float r1 = a1 + b1;
1645  float r2 = a2 + b2;
1646  return __floats2half2_rn(r1, r2);
1647 #endif
1648 }
1649 
1650 template <>
1651 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
1652 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1653  return __hmul2(a, b);
1654 #else
1655  float a1 = __low2float(a);
1656  float a2 = __high2float(a);
1657  float b1 = __low2float(b);
1658  float b2 = __high2float(b);
1659  float r1 = a1 * b1;
1660  float r2 = a2 * b2;
1661  return __floats2half2_rn(r1, r2);
1662 #endif
1663 }
1664 
1665 template <>
1666 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
1667 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1668  return __h2div(a, b);
1669 #else
1670  float a1 = __low2float(a);
1671  float a2 = __high2float(a);
1672  float b1 = __low2float(b);
1673  float b2 = __high2float(b);
1674  float r1 = a1 / b1;
1675  float r2 = a2 / b2;
1676  return __floats2half2_rn(r1, r2);
1677 #endif
1678 }
1679 
1680 template <>
1681 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
1682  float a1 = __low2float(a);
1683  float a2 = __high2float(a);
1684  float b1 = __low2float(b);
1685  float b2 = __high2float(b);
1686  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
1687  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
1688  return __halves2half2(r1, r2);
1689 }
1690 
1691 template <>
1692 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
1693  float a1 = __low2float(a);
1694  float a2 = __high2float(a);
1695  float b1 = __low2float(b);
1696  float b2 = __high2float(b);
1697  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
1698  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
1699  return __halves2half2(r1, r2);
1700 }
1701 
1702 #endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
1703 
1704 #undef EIGEN_GPU_HAS_LDG
1705 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1706 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1707 
1708 } // end namespace internal
1709 
1710 } // end namespace Eigen
1711 
1712 #endif // EIGEN_PACKET_MATH_GPU_H
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_ceil_op< typename Derived::Scalar >, const Derived > ceil(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_rint_op< typename Derived::Scalar >, const Derived > rint(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_floor_op< typename Derived::Scalar >, const Derived > floor(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_trunc_op< typename Derived::Scalar >, const Derived > trunc(const Eigen::ArrayBase< Derived > &x)