// Eigen 5.0.1-dev
// PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
5 // Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_PACKET_MATH_LSX_H
12 #define EIGEN_PACKET_MATH_LSX_H
13 
14 // IWYU pragma: private
15 #include "../../InternalHeaderCheck.h"
16 
17 namespace Eigen {
18 
19 namespace internal {
20 
// Default configuration macros for this backend. Each one is only defined
// when nothing earlier has already defined it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// The register-count default is only provided when EIGEN_ARCH_LOONGARCH64 is
// non-zero.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#if EIGEN_ARCH_LOONGARCH64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
#endif

#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
34 
35 typedef __m128 Packet4f;
36 typedef __m128d Packet2d;
37 
38 typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
39 typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
40 typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
41 typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
42 typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
43 typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
44 typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
45 typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
46 
47 template <>
48 struct is_arithmetic<__m128> {
49  enum { value = true };
50 };
51 template <>
52 struct is_arithmetic<__m128i> {
53  enum { value = true };
54 };
55 template <>
56 struct is_arithmetic<__m128d> {
57  enum { value = true };
58 };
59 template <>
60 struct is_arithmetic<Packet16c> {
61  enum { value = true };
62 };
63 template <>
64 struct is_arithmetic<Packet8s> {
65  enum { value = true };
66 };
67 template <>
68 struct is_arithmetic<Packet4i> {
69  enum { value = true };
70 };
71 template <>
72 struct is_arithmetic<Packet2l> {
73  enum { value = true };
74 };
75 template <>
76 struct is_arithmetic<Packet16uc> {
77  enum { value = false };
78 };
79 template <>
80 struct is_arithmetic<Packet8us> {
81  enum { value = false };
82 };
83 template <>
84 struct is_arithmetic<Packet4ui> {
85  enum { value = false };
86 };
87 template <>
88 struct is_arithmetic<Packet2ul> {
89  enum { value = false };
90 };
91 
92 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
93  float from[4] = {a, b, c, d};
94  return (Packet4f)__lsx_vld(from, 0);
95 }
96 
97 EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
98  const float* a = reinterpret_cast<const float*>(&m);
99  Packet4f res =
100  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
101  return res;
102 }
103 
104 template <bool interleave>
105 EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
106  const float* a = reinterpret_cast<const float*>(&m);
107  const float* b = reinterpret_cast<const float*>(&n);
108  Packet4f res =
109  make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
110  return res;
111 }
112 
113 template <>
114 EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
115  const float* a = reinterpret_cast<const float*>(&m);
116  const float* b = reinterpret_cast<const float*>(&n);
117  Packet4f res =
118  make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
119  return res;
120 }
121 
// Packs four lane selectors into one shuffle mask: p occupies bits 0-1,
// q bits 2-3, r bits 4-5 and s bits 6-7.
EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
  int mask = p;
  mask |= q << 2;
  mask |= r << 4;
  mask |= s << 6;
  return mask;
}
125 
126 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
127  return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
128 }
129 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
130  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
131 }
132 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
133  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
134 }
135 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
136  return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
137 }
138 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
139  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
140 }
141 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
142  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
143 }
144 
145 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
146  double from[2] = {a, b};
147  return (Packet2d)__lsx_vld(from, 0);
148 }
149 
150 EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
151  const double* a = reinterpret_cast<const double*>(&m);
152  const double* b = reinterpret_cast<const double*>(&n);
153  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
154  return res;
155 }
156 
157 EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
158  return shuffle(a, b, mask);
159 }
160 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
161 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
162 
163 template <>
164 struct packet_traits<int8_t> : default_packet_traits {
165  typedef Packet16c type;
166  typedef Packet16c half;
167  enum {
168  Vectorizable = 1,
169  AlignedOnScalar = 1,
170  size = 16,
171 
172  HasAbs2 = 0,
173  HasSetLinear = 0,
174  HasCmp = 1,
175  HasBlend = 0
176  };
177 };
178 
179 template <>
180 struct packet_traits<int16_t> : default_packet_traits {
181  typedef Packet8s type;
182  typedef Packet8s half;
183  enum {
184  Vectorizable = 1,
185  AlignedOnScalar = 1,
186  size = 8,
187 
188  HasAbs2 = 0,
189  HasSetLinear = 0,
190  HasCmp = 1,
191  HasDiv = 1,
192  HasBlend = 0
193  };
194 };
195 
196 template <>
197 struct packet_traits<int32_t> : default_packet_traits {
198  typedef Packet4i type;
199  typedef Packet4i half;
200  enum {
201  Vectorizable = 1,
202  AlignedOnScalar = 1,
203  size = 4,
204 
205  HasAbs2 = 0,
206  HasSetLinear = 0,
207  HasCmp = 1,
208  HasDiv = 1,
209  HasBlend = 0
210  };
211 };
212 
213 template <>
214 struct packet_traits<int64_t> : default_packet_traits {
215  typedef Packet2l type;
216  typedef Packet2l half;
217  enum {
218  Vectorizable = 1,
219  AlignedOnScalar = 1,
220  size = 2,
221 
222  HasAbs2 = 0,
223  HasSetLinear = 0,
224  HasCmp = 1,
225  HasDiv = 1,
226  HasBlend = 0
227  };
228 };
229 
230 template <>
231 struct packet_traits<uint8_t> : default_packet_traits {
232  typedef Packet16uc type;
233  typedef Packet16uc half;
234  enum {
235  Vectorizable = 1,
236  AlignedOnScalar = 1,
237  size = 16,
238 
239  HasAbs2 = 0,
240  HasSetLinear = 0,
241  HasNegate = 0,
242  HasCmp = 1,
243  HasBlend = 0
244  };
245 };
246 
247 template <>
248 struct packet_traits<uint16_t> : default_packet_traits {
249  typedef Packet8us type;
250  typedef Packet8us half;
251  enum {
252  Vectorizable = 1,
253  AlignedOnScalar = 1,
254  size = 8,
255 
256  HasAbs2 = 0,
257  HasSetLinear = 0,
258  HasNegate = 0,
259  HasCmp = 1,
260  HasDiv = 1,
261  HasBlend = 0
262  };
263 };
264 
265 template <>
266 struct packet_traits<uint32_t> : default_packet_traits {
267  typedef Packet4ui type;
268  typedef Packet4ui half;
269  enum {
270  Vectorizable = 1,
271  AlignedOnScalar = 1,
272  size = 4,
273 
274  HasAbs2 = 0,
275  HasSetLinear = 0,
276  HasNegate = 0,
277  HasCmp = 1,
278  HasDiv = 1,
279  HasBlend = 0
280  };
281 };
282 
283 template <>
284 struct packet_traits<uint64_t> : default_packet_traits {
285  typedef Packet2ul type;
286  typedef Packet2ul half;
287  enum {
288  Vectorizable = 1,
289  AlignedOnScalar = 1,
290  size = 2,
291 
292  HasAbs2 = 0,
293  HasSetLinear = 0,
294  HasNegate = 0,
295  HasCmp = 1,
296  HasDiv = 1,
297  HasBlend = 0
298  };
299 };
300 
301 template <>
302 struct packet_traits<float> : default_packet_traits {
303  typedef Packet4f type;
304  typedef Packet4f half;
305  enum {
306  Vectorizable = 1,
307  AlignedOnScalar = 1,
308  size = 4,
309 
310  HasAbs2 = 0,
311  HasSetLinear = 0,
312  HasBlend = 0,
313  HasSign = 0,
314  HasDiv = 1,
315  HasExp = 1,
316  HasSqrt = 1,
317  HasLog = 1,
318  HasRsqrt = 1
319  };
320 };
321 
322 template <>
323 struct packet_traits<double> : default_packet_traits {
324  typedef Packet2d type;
325  typedef Packet2d half;
326  enum {
327  Vectorizable = 1,
328  AlignedOnScalar = 1,
329  size = 2,
330 
331  HasAbs2 = 0,
332  HasSetLinear = 0,
333  HasBlend = 0,
334  HasSign = 0,
335  HasDiv = 1,
336  HasSqrt = 1,
337  HasLog = 1,
338  HasRsqrt = 1
339  };
340 };
341 
342 template <>
343 struct unpacket_traits<Packet16c> {
344  typedef int8_t type;
345  typedef Packet16c half;
346  enum {
347  size = 16,
348  alignment = Aligned16,
349  vectorizable = true,
350  masked_load_available = false,
351  masked_store_available = false
352  };
353 };
354 template <>
355 struct unpacket_traits<Packet8s> {
356  typedef int16_t type;
357  typedef Packet8s half;
358  enum {
359  size = 8,
360  alignment = Aligned16,
361  vectorizable = true,
362  masked_load_available = false,
363  masked_store_available = false
364  };
365 };
366 template <>
367 struct unpacket_traits<Packet4i> {
368  typedef int32_t type;
369  typedef Packet4i half;
370  enum {
371  size = 4,
372  alignment = Aligned16,
373  vectorizable = true,
374  masked_load_available = false,
375  masked_store_available = false
376  };
377 };
378 template <>
379 struct unpacket_traits<Packet2l> {
380  typedef int64_t type;
381  typedef Packet2l half;
382  enum {
383  size = 2,
384  alignment = Aligned16,
385  vectorizable = true,
386  masked_load_available = false,
387  masked_store_available = false
388  };
389 };
390 template <>
391 struct unpacket_traits<Packet16uc> {
392  typedef uint8_t type;
393  typedef Packet16uc half;
394  enum {
395  size = 16,
396  alignment = Aligned16,
397  vectorizable = true,
398  masked_load_available = false,
399  masked_store_available = false
400  };
401 };
402 template <>
403 struct unpacket_traits<Packet8us> {
404  typedef uint16_t type;
405  typedef Packet8us half;
406  enum {
407  size = 8,
408  alignment = Aligned16,
409  vectorizable = true,
410  masked_load_available = false,
411  masked_store_available = false
412  };
413 };
414 template <>
415 struct unpacket_traits<Packet4ui> {
416  typedef uint32_t type;
417  typedef Packet4ui half;
418  enum {
419  size = 4,
420  alignment = Aligned16,
421  vectorizable = true,
422  masked_load_available = false,
423  masked_store_available = false
424  };
425 };
426 template <>
427 struct unpacket_traits<Packet2ul> {
428  typedef uint64_t type;
429  typedef Packet2ul half;
430  enum {
431  size = 2,
432  alignment = Aligned16,
433  vectorizable = true,
434  masked_load_available = false,
435  masked_store_available = false
436  };
437 };
438 template <>
439 struct unpacket_traits<Packet4f> {
440  typedef float type;
441  typedef Packet4f half;
442  typedef Packet4i integer_packet;
443  enum {
444  size = 4,
445  alignment = Aligned16,
446  vectorizable = true,
447  masked_load_available = false,
448  masked_store_available = false
449  };
450 };
451 template <>
452 struct unpacket_traits<Packet2d> {
453  typedef double type;
454  typedef Packet2d half;
455  typedef Packet2l integer_packet;
456  enum {
457  size = 2,
458  alignment = Aligned16,
459  vectorizable = true,
460  masked_load_available = false,
461  masked_store_available = false
462  };
463 };
464 
465 template <>
466 EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
467  return __lsx_vreplgr2vr_b(from);
468 }
469 template <>
470 EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
471  return __lsx_vreplgr2vr_h(from);
472 }
473 template <>
474 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
475  return __lsx_vreplgr2vr_w(from);
476 }
477 template <>
478 EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
479  return __lsx_vreplgr2vr_d(from);
480 }
481 template <>
482 EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
483  return __lsx_vreplgr2vr_b(from);
484 }
485 template <>
486 EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
487  return __lsx_vreplgr2vr_h(from);
488 }
489 template <>
490 EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
491  return __lsx_vreplgr2vr_w(from);
492 }
493 template <>
494 EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
495  return __lsx_vreplgr2vr_d(from);
496 }
497 template <>
498 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
499  Packet4f v = {from, from, from, from};
500  return v;
501 }
502 template <>
503 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
504  Packet2d v = {from, from};
505  return v;
506 }
507 
508 template <>
509 EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
510  return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
511 }
512 template <>
513 EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
514  return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
515 }
516 
517 template <>
518 EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
519  const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
520  return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
521 }
522 template <>
523 EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
524  const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
525  return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
526 }
527 template <>
528 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
529  const int32_t countdown[] = {0, 1, 2, 3};
530  return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
531 }
532 template <>
533 EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
534  const int64_t countdown[] = {0, 1};
535  return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
536 }
537 template <>
538 EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
539  const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
540  return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
541 }
542 template <>
543 EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
544  const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
545  return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
546 }
547 template <>
548 EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
549  const uint32_t countdown[] = {0, 1, 2, 3};
550  return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
551 }
552 template <>
553 EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
554  const uint64_t countdown[] = {0, 1};
555  return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
556 }
557 template <>
558 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
559  static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
560  return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
561 }
562 template <>
563 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
564  static const Packet2d countdown = {0.0f, 1.0f};
565  return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
566 }
567 
568 template <>
569 EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
570  return __lsx_vadd_b(a, b);
571 }
572 template <>
573 EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
574  return __lsx_vadd_h(a, b);
575 }
576 template <>
577 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
578  return __lsx_vadd_w(a, b);
579 }
580 template <>
581 EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
582  return __lsx_vadd_d(a, b);
583 }
584 template <>
585 EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
586  return __lsx_vadd_b(a, b);
587 }
588 template <>
589 EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
590  return __lsx_vadd_h(a, b);
591 }
592 template <>
593 EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
594  return __lsx_vadd_w(a, b);
595 }
596 template <>
597 EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
598  return __lsx_vadd_d(a, b);
599 }
600 template <>
601 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
602  return __lsx_vfadd_s(a, b);
603 }
604 template <>
605 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
606  return __lsx_vfadd_d(a, b);
607 }
608 
609 template <>
610 EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
611  return __lsx_vsub_b(a, b);
612 }
613 template <>
614 EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
615  return __lsx_vsub_h(a, b);
616 }
617 template <>
618 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
619  return __lsx_vsub_w(a, b);
620 }
621 template <>
622 EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
623  return __lsx_vsub_d(a, b);
624 }
625 template <>
626 EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
627  return __lsx_vsub_b(a, b);
628 }
629 template <>
630 EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
631  return __lsx_vsub_h(a, b);
632 }
633 template <>
634 EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
635  return __lsx_vsub_w(a, b);
636 }
637 template <>
638 EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
639  return __lsx_vsub_d(a, b);
640 }
641 template <>
642 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
643  return __lsx_vfsub_s(a, b);
644 }
645 template <>
646 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
647  return __lsx_vfsub_d(a, b);
648 }
649 
650 template <>
651 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
652 template <>
653 EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
654  const Packet4f mask =
655  make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
656  return padd(a, pxor(mask, b));
657 }
658 template <>
659 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
660 template <>
661 EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
662  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
663  return padd(a, pxor(mask, b));
664 }
665 
666 template <>
667 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
668  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
669  numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
670  return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
671 }
672 template <>
673 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
674  Packet2d mask =
675  make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
676  return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
677 }
678 template <>
679 EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
680  return __lsx_vneg_b(a);
681 }
682 template <>
683 EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
684  return __lsx_vneg_h(a);
685 }
686 template <>
687 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
688  return __lsx_vneg_w(a);
689 }
690 template <>
691 EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
692  return __lsx_vneg_d(a);
693 }
694 
695 template <>
696 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
697  return a;
698 }
699 template <>
700 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
701  return a;
702 }
703 template <>
704 EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
705  return a;
706 }
707 template <>
708 EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
709  return a;
710 }
711 template <>
712 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
713  return a;
714 }
715 template <>
716 EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
717  return a;
718 }
719 template <>
720 EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
721  return a;
722 }
723 template <>
724 EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
725  return a;
726 }
727 template <>
728 EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
729  return a;
730 }
731 template <>
732 EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
733  return a;
734 }
735 
736 template <>
737 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
738  return __lsx_vfmul_s(a, b);
739 }
740 template <>
741 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
742  return __lsx_vfmul_d(a, b);
743 }
744 template <>
745 EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
746  return __lsx_vmul_b(a, b);
747 }
748 template <>
749 EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
750  return __lsx_vmul_h(a, b);
751 }
752 template <>
753 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
754  return __lsx_vmul_w(a, b);
755 }
756 template <>
757 EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
758  return __lsx_vmul_d(a, b);
759 }
760 template <>
761 EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
762  return __lsx_vmul_b(a, b);
763 }
764 template <>
765 EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
766  return __lsx_vmul_h(a, b);
767 }
768 template <>
769 EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
770  return __lsx_vmul_w(a, b);
771 }
772 template <>
773 EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
774  return __lsx_vmul_d(a, b);
775 }
776 
777 template <>
778 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
779  return __lsx_vfdiv_s(a, b);
780 }
781 template <>
782 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
783  return __lsx_vfdiv_d(a, b);
784 }
785 template <>
786 EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
787  return __lsx_vdiv_h(a, b);
788 }
789 template <>
790 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
791  return __lsx_vdiv_w(a, b);
792 }
793 template <>
794 EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
795  return __lsx_vdiv_d(a, b);
796 }
797 template <>
798 EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
799  return __lsx_vdiv_hu(a, b);
800 }
801 template <>
802 EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
803  return __lsx_vdiv_wu(a, b);
804 }
805 template <>
806 EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
807  return __lsx_vdiv_du(a, b);
808 }
809 
810 template <>
811 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
812  return __lsx_vfmadd_s(a, b, c);
813 }
814 template <>
815 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
816  return __lsx_vfmadd_d(a, b, c);
817 }
818 template <>
819 EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
820  return __lsx_vfmsub_s(a, b, c);
821 }
822 template <>
823 EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
824  return __lsx_vfmsub_d(a, b, c);
825 }
826 template <>
827 EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
828  return __lsx_vfnmsub_s(a, b, c);
829 }
830 template <>
831 EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
832  return __lsx_vfnmsub_d(a, b, c);
833 }
834 template <>
835 EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
836  return __lsx_vfnmadd_s(a, b, c);
837 }
838 template <>
839 EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
840  return __lsx_vfnmadd_d(a, b, c);
841 }
842 template <>
843 EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
844  return __lsx_vmadd_b(c, a, b);
845 }
846 template <>
847 EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
848  return __lsx_vmadd_h(c, a, b);
849 }
850 template <>
851 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
852  return __lsx_vmadd_w(c, a, b);
853 }
854 template <>
855 EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
856  return __lsx_vmadd_d(c, a, b);
857 }
858 template <>
859 EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
860  return __lsx_vmadd_b(c, a, b);
861 }
862 template <>
863 EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
864  return __lsx_vmadd_h(c, a, b);
865 }
866 template <>
867 EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
868  return __lsx_vmadd_w(c, a, b);
869 }
870 template <>
871 EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
872  return __lsx_vmadd_d(c, a, b);
873 }
874 
875 template <>
876 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
877  return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
878 }
879 template <>
880 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
881  return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
882 }
883 template <>
884 EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
885  return __lsx_vand_v(a, b);
886 }
887 template <>
888 EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
889  return __lsx_vand_v(a, b);
890 }
891 template <>
892 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
893  return __lsx_vand_v(a, b);
894 }
895 template <>
896 EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
897  return __lsx_vand_v(a, b);
898 }
899 template <>
900 EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
901  return __lsx_vand_v(a, b);
902 }
903 template <>
904 EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
905  return __lsx_vand_v(a, b);
906 }
907 template <>
908 EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
909  return __lsx_vand_v(a, b);
910 }
911 template <>
912 EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
913  return __lsx_vand_v(a, b);
914 }
915 
916 template <>
917 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
918  return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
919 }
920 template <>
921 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
922  return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
923 }
924 template <>
925 EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
926  return __lsx_vor_v(a, b);
927 }
928 template <>
929 EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
930  return __lsx_vor_v(a, b);
931 }
932 template <>
933 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
934  return __lsx_vor_v(a, b);
935 }
936 template <>
937 EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
938  return __lsx_vor_v(a, b);
939 }
940 template <>
941 EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
942  return __lsx_vor_v(a, b);
943 }
944 template <>
945 EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
946  return __lsx_vor_v(a, b);
947 }
948 template <>
949 EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
950  return __lsx_vor_v(a, b);
951 }
952 template <>
953 EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
954  return __lsx_vor_v(a, b);
955 }
956 
957 template <>
958 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
959  return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
960 }
961 template <>
962 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
963  return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
964 }
965 template <>
966 EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
967  return __lsx_vxor_v(a, b);
968 }
969 template <>
970 EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
971  return __lsx_vxor_v(a, b);
972 }
973 template <>
974 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
975  return __lsx_vxor_v(a, b);
976 }
977 template <>
978 EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
979  return __lsx_vxor_v(a, b);
980 }
981 template <>
982 EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
983  return __lsx_vxor_v(a, b);
984 }
985 template <>
986 EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
987  return __lsx_vxor_v(a, b);
988 }
989 template <>
990 EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
991  return __lsx_vxor_v(a, b);
992 }
993 template <>
994 EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
995  return __lsx_vxor_v(a, b);
996 }
997 
998 template <>
999 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1000  return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
1001 }
1002 template <>
1003 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
1004  return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
1005 }
1006 template <>
1007 EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
1008  return __lsx_vandn_v(b, a);
1009 }
1010 template <>
1011 EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
1012  return __lsx_vandn_v(b, a);
1013 }
1014 template <>
1015 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1016  return __lsx_vandn_v(b, a);
1017 }
1018 template <>
1019 EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
1020  return __lsx_vandn_v(b, a);
1021 }
1022 template <>
1023 EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1024  return __lsx_vandn_v(b, a);
1025 }
1026 template <>
1027 EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
1028  return __lsx_vandn_v(b, a);
1029 }
1030 template <>
1031 EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1032  return __lsx_vandn_v(b, a);
1033 }
1034 template <>
1035 EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1036  return __lsx_vandn_v(b, a);
1037 }
1038 
1039 template <>
1040 EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1041  return (Packet4f)__lsx_vfcmp_cle_s(a, b);
1042 }
1043 template <>
1044 EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
1045  return (Packet2d)__lsx_vfcmp_cle_d(a, b);
1046 }
1047 template <>
1048 EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1049  return __lsx_vsle_b(a, b);
1050 }
1051 template <>
1052 EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1053  return __lsx_vsle_h(a, b);
1054 }
1055 template <>
1056 EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1057  return __lsx_vsle_w(a, b);
1058 }
1059 template <>
1060 EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1061  return __lsx_vsle_d(a, b);
1062 }
1063 template <>
1064 EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1065  return __lsx_vsle_bu(a, b);
1066 }
1067 template <>
1068 EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1069  return __lsx_vsle_hu(a, b);
1070 }
1071 template <>
1072 EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1073  return __lsx_vsle_wu(a, b);
1074 }
1075 template <>
1076 EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1077  return __lsx_vsle_du(a, b);
1078 }
1079 
1080 template <>
1081 EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1082  return (Packet4f)__lsx_vfcmp_clt_s(a, b);
1083 }
1084 template <>
1085 EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
1086  return (Packet2d)__lsx_vfcmp_clt_d(a, b);
1087 }
1088 template <>
1089 EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1090  return __lsx_vslt_b(a, b);
1091 }
1092 template <>
1093 EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1094  return __lsx_vslt_h(a, b);
1095 }
1096 template <>
1097 EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1098  return __lsx_vslt_w(a, b);
1099 }
1100 template <>
1101 EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1102  return __lsx_vslt_d(a, b);
1103 }
1104 template <>
1105 EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1106  return __lsx_vslt_bu(a, b);
1107 }
1108 template <>
1109 EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1110  return __lsx_vslt_hu(a, b);
1111 }
1112 template <>
1113 EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1114  return __lsx_vslt_wu(a, b);
1115 }
1116 template <>
1117 EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1118  return __lsx_vslt_du(a, b);
1119 }
1120 
1121 template <>
1122 EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1123  return (Packet4f)__lsx_vfcmp_sult_s(a, b);
1124 }
1125 template <>
1126 EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
1127  return (Packet2d)__lsx_vfcmp_sult_d(a, b);
1128 }
1129 
1130 template <>
1131 EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1132  return (Packet4f)__lsx_vfcmp_seq_s(a, b);
1133 }
1134 template <>
1135 EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
1136  return (Packet2d)__lsx_vfcmp_seq_d(a, b);
1137 }
1138 template <>
1139 EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1140  return __lsx_vseq_b(a, b);
1141 }
1142 template <>
1143 EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1144  return __lsx_vseq_h(a, b);
1145 }
1146 template <>
1147 EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1148  return __lsx_vseq_w(a, b);
1149 }
1150 template <>
1151 EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1152  return __lsx_vseq_d(a, b);
1153 }
1154 template <>
1155 EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1156  return __lsx_vseq_b(a, b);
1157 }
1158 template <>
1159 EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1160  return __lsx_vseq_h(a, b);
1161 }
1162 template <>
1163 EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1164  return __lsx_vseq_w(a, b);
1165 }
1166 template <>
1167 EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1168  return __lsx_vseq_d(a, b);
1169 }
1170 
1171 template <>
1172 EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1173  return __lsx_vmin_b(a, b);
1174 }
1175 template <>
1176 EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1177  return __lsx_vmin_h(a, b);
1178 }
1179 template <>
1180 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1181  return __lsx_vmin_w(a, b);
1182 }
1183 template <>
1184 EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1185  return __lsx_vmin_d(a, b);
1186 }
1187 template <>
1188 EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1189  return __lsx_vmin_bu(a, b);
1190 }
1191 template <>
1192 EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1193  return __lsx_vmin_hu(a, b);
1194 }
1195 template <>
1196 EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1197  return __lsx_vmin_wu(a, b);
1198 }
1199 template <>
1200 EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1201  return __lsx_vmin_du(a, b);
1202 }
1203 
1204 template <>
1205 EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1206  return __lsx_vmax_b(a, b);
1207 }
1208 template <>
1209 EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1210  return __lsx_vmax_h(a, b);
1211 }
1212 template <>
1213 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1214  return __lsx_vmax_w(a, b);
1215 }
1216 template <>
1217 EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1218  return __lsx_vmax_d(a, b);
1219 }
1220 template <>
1221 EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1222  return __lsx_vmax_bu(a, b);
1223 }
1224 template <>
1225 EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1226  return __lsx_vmax_hu(a, b);
1227 }
1228 template <>
1229 EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1230  return __lsx_vmax_wu(a, b);
1231 }
1232 template <>
1233 EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1234  return __lsx_vmax_du(a, b);
1235 }
1236 
1237 template <>
1238 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1239  Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1240  Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
1241  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1242 }
1243 template <>
1244 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
1245  Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1246  Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
1247  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1248 }
1249 template <>
1250 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1251  Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1252  Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
1253  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1254 }
1255 template <>
1256 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1257  Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1258  Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
1259  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1260 }
1261 
1262 template <int N>
1263 EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
1264  return __lsx_vsrai_b((__m128i)a, N);
1265 }
1266 template <int N>
1267 EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
1268  return __lsx_vsrai_h((__m128i)a, N);
1269 }
1270 template <int N>
1271 EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1272  return __lsx_vsrai_w((__m128i)a, N);
1273 }
1274 template <int N>
1275 EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1276  return __lsx_vsrai_d((__m128i)a, N);
1277 }
1278 template <int N>
1279 EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
1280  return __lsx_vsrli_b((__m128i)a, N);
1281 }
1282 template <int N>
1283 EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
1284  return __lsx_vsrli_h((__m128i)a, N);
1285 }
1286 template <int N>
1287 EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1288  return __lsx_vsrli_w((__m128i)a, N);
1289 }
1290 template <int N>
1291 EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
1292  return __lsx_vsrli_d((__m128i)a, N);
1293 }
1294 
1295 template <int N>
1296 EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
1297  return __lsx_vsrli_b((__m128i)a, N);
1298 }
1299 template <int N>
1300 EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
1301  return __lsx_vsrli_h((__m128i)a, N);
1302 }
1303 template <int N>
1304 EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1305  return __lsx_vsrli_w((__m128i)a, N);
1306 }
1307 template <int N>
1308 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1309  return __lsx_vsrli_d((__m128i)a, N);
1310 }
1311 template <int N>
1312 EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
1313  return __lsx_vsrli_b((__m128i)a, N);
1314 }
1315 template <int N>
1316 EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1317  return __lsx_vsrli_h((__m128i)a, N);
1318 }
1319 template <int N>
1320 EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1321  return __lsx_vsrli_w((__m128i)a, N);
1322 }
1323 template <int N>
1324 EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
1325  return __lsx_vsrli_d((__m128i)a, N);
1326 }
1327 
1328 template <int N>
1329 EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
1330  return __lsx_vslli_b((__m128i)a, N);
1331 }
1332 template <int N>
1333 EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
1334  return __lsx_vslli_h((__m128i)a, N);
1335 }
1336 template <int N>
1337 EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1338  return __lsx_vslli_w((__m128i)a, N);
1339 }
1340 template <int N>
1341 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1342  return __lsx_vslli_d((__m128i)a, N);
1343 }
1344 template <int N>
1345 EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
1346  return __lsx_vslli_b((__m128i)a, N);
1347 }
1348 template <int N>
1349 EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1350  return __lsx_vslli_h((__m128i)a, N);
1351 }
1352 template <int N>
1353 EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1354  return __lsx_vslli_w((__m128i)a, N);
1355 }
1356 template <int N>
1357 EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
1358  return __lsx_vslli_d((__m128i)a, N);
1359 }
1360 
1361 template <>
1362 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1363  return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
1364 }
1365 template <>
1366 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1367  return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
1368 }
1369 template <>
1370 EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1371  return __lsx_vabsd_b(a, pzero(a));
1372 }
1373 template <>
1374 EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1375  return __lsx_vabsd_h(a, pzero(a));
1376 }
1377 template <>
1378 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1379  return __lsx_vabsd_w(a, pzero(a));
1380 }
1381 template <>
1382 EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1383  return __lsx_vabsd_d(a, pzero(a));
1384 }
1385 template <>
1386 EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1387  return a;
1388 }
1389 template <>
1390 EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1391  return a;
1392 }
1393 template <>
1394 EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1395  return a;
1396 }
1397 template <>
1398 EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
1399  return a;
1400 }
1401 
1402 template <>
1403 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1404  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1405 }
1406 template <>
1407 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1408  EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1409 }
1410 template <>
1411 EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
1412  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1413 }
1414 template <>
1415 EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
1416  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1417 }
1418 template <>
1419 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
1420  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1421 }
1422 template <>
1423 EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1424  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1425 }
1426 template <>
1427 EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
1428  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1429 }
1430 template <>
1431 EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
1432  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1433 }
1434 template <>
1435 EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1436  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1437 }
1438 template <>
1439 EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
1440  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1441 }
1442 
1443 template <>
1444 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1445  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1446 }
1447 template <>
1448 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1449  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1450 }
1451 template <>
1452 EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
1453  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1454 }
1455 template <>
1456 EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
1457  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1458 }
1459 template <>
1460 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
1461  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1462 }
1463 template <>
1464 EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1465  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1466 }
1467 template <>
1468 EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
1469  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1470 }
1471 template <>
1472 EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
1473  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1474 }
1475 template <>
1476 EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1477  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1478 }
1479 template <>
1480 EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
1481  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1482 }
1483 
1484 template <>
1485 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1486  float f0 = from[0], f1 = from[1];
1487  return make_packet4f(f0, f0, f1, f1);
1488 }
1489 template <>
1490 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1491  return pset1<Packet2d>(from[0]);
1492 }
1493 template <>
1494 EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
1495  Packet16c tmp = pload<Packet16c>(from);
1496  return __lsx_vilvl_b(tmp, tmp);
1497 }
1498 template <>
1499 EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
1500  Packet8s tmp = pload<Packet8s>(from);
1501  return __lsx_vilvl_h(tmp, tmp);
1502 }
1503 template <>
1504 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
1505  Packet4i tmp = pload<Packet4i>(from);
1506  return __lsx_vilvl_w(tmp, tmp);
1507 }
1508 template <>
1509 EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1510  return pset1<Packet2l>(from[0]);
1511 }
1512 template <>
1513 EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
1514  Packet16uc tmp = pload<Packet16uc>(from);
1515  return __lsx_vilvl_b(tmp, tmp);
1516 }
1517 template <>
1518 EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
1519  Packet8us tmp = pload<Packet8us>(from);
1520  return __lsx_vilvl_h(tmp, tmp);
1521 }
1522 template <>
1523 EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1524  Packet4ui tmp = pload<Packet4ui>(from);
1525  return __lsx_vilvl_w(tmp, tmp);
1526 }
1527 template <>
1528 EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
1529  return pset1<Packet2ul>(from[0]);
1530 }
1531 
1532 template <>
1533 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1534  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1535 }
1536 template <>
1537 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1538  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1539 }
1540 template <>
1541 EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
1542  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1543 }
1544 template <>
1545 EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
1546  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1547 }
1548 template <>
1549 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
1550  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1551 }
1552 template <>
1553 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1554  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1555 }
1556 template <>
1557 EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
1558  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1559 }
1560 template <>
1561 EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
1562  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1563 }
1564 template <>
1565 EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1566  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1567 }
1568 template <>
1569 EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
1570  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1571 }
1572 
1573 template <>
1574 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1575  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1576 }
1577 template <>
1578 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1579  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1580 }
1581 
1582 template <>
1583 EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
1584  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1585 }
1586 template <>
1587 EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
1588  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1589 }
1590 template <>
1591 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
1592  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1593 }
1594 template <>
1595 EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1596  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1597 }
1598 template <>
1599 EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
1600  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1601 }
1602 template <>
1603 EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
1604  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1605 }
1606 template <>
1607 EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1608  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1609 }
1610 template <>
1611 EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
1612  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1613 }
1614 
1615 template <>
1616 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1617  Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
1618  return v;
1619 }
1620 template <>
1621 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1622  Packet2d v = {from[0], from[stride]};
1623  return v;
1624 }
1625 template <>
1626 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
1627  int8_t v[16] __attribute__((aligned(16)));
1628  v[0] = from[0];
1629  v[1] = from[stride];
1630  v[2] = from[2 * stride];
1631  v[3] = from[3 * stride];
1632  v[4] = from[4 * stride];
1633  v[5] = from[5 * stride];
1634  v[6] = from[6 * stride];
1635  v[7] = from[7 * stride];
1636  v[8] = from[8 * stride];
1637  v[9] = from[9 * stride];
1638  v[10] = from[10 * stride];
1639  v[11] = from[11 * stride];
1640  v[12] = from[12 * stride];
1641  v[13] = from[13 * stride];
1642  v[14] = from[14 * stride];
1643  v[15] = from[15 * stride];
1644  return __lsx_vld(v, 0);
1645 }
1646 template <>
1647 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
1648  int16_t v[8] __attribute__((aligned(16)));
1649  v[0] = from[0];
1650  v[1] = from[stride];
1651  v[2] = from[2 * stride];
1652  v[3] = from[3 * stride];
1653  v[4] = from[4 * stride];
1654  v[5] = from[5 * stride];
1655  v[6] = from[6 * stride];
1656  v[7] = from[7 * stride];
1657  return __lsx_vld(v, 0);
1658 }
1659 template <>
1660 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
1661  int32_t v[4] __attribute__((aligned(16)));
1662  v[0] = from[0];
1663  v[1] = from[stride];
1664  v[2] = from[2 * stride];
1665  v[3] = from[3 * stride];
1666  return __lsx_vld(v, 0);
1667 }
1668 template <>
1669 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1670  int64_t v[2] __attribute__((aligned(16)));
1671  v[0] = from[0];
1672  v[1] = from[stride];
1673  return __lsx_vld(v, 0);
1674 }
1675 template <>
1676 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
1677  uint8_t v[16] __attribute__((aligned(16)));
1678  v[0] = from[0];
1679  v[1] = from[stride];
1680  v[2] = from[2 * stride];
1681  v[3] = from[3 * stride];
1682  v[4] = from[4 * stride];
1683  v[5] = from[5 * stride];
1684  v[6] = from[6 * stride];
1685  v[7] = from[7 * stride];
1686  v[8] = from[8 * stride];
1687  v[9] = from[9 * stride];
1688  v[10] = from[10 * stride];
1689  v[11] = from[11 * stride];
1690  v[12] = from[12 * stride];
1691  v[13] = from[13 * stride];
1692  v[14] = from[14 * stride];
1693  v[15] = from[15 * stride];
1694  return __lsx_vld(v, 0);
1695 }
1696 template <>
1697 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
1698  uint16_t v[8] __attribute__((aligned(16)));
1699  v[0] = from[0];
1700  v[1] = from[stride];
1701  v[2] = from[2 * stride];
1702  v[3] = from[3 * stride];
1703  v[4] = from[4 * stride];
1704  v[5] = from[5 * stride];
1705  v[6] = from[6 * stride];
1706  v[7] = from[7 * stride];
1707  return __lsx_vld(v, 0);
1708 }
1709 template <>
1710 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1711  uint32_t v[4] __attribute__((aligned(16)));
1712  v[0] = from[0];
1713  v[1] = from[stride];
1714  v[2] = from[2 * stride];
1715  v[3] = from[3 * stride];
1716  return __lsx_vld(v, 0);
1717 }
1718 template <>
1719 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
1720  uint64_t v[2] __attribute__((aligned(16)));
1721  v[0] = from[0];
1722  v[1] = from[stride];
1723  return __lsx_vld(v, 0);
1724 }
1725 
1726 template <>
1727 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1728  __lsx_vstelm_w(from, to, 0, 0);
1729  __lsx_vstelm_w(from, to + stride * 1, 0, 1);
1730  __lsx_vstelm_w(from, to + stride * 2, 0, 2);
1731  __lsx_vstelm_w(from, to + stride * 3, 0, 3);
1732 }
1733 template <>
1734 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1735  __lsx_vstelm_d(from, to, 0, 0);
1736  __lsx_vstelm_d(from, to + stride, 0, 1);
1737 }
1738 template <>
1739 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
1740  Index stride) {
1741  __lsx_vstelm_b((__m128i)from, to, 0, 0);
1742  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1743  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1744  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1745  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1746  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1747  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1748  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1749  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1750  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1751  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1752  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1753  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1754  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1755  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1756  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1757 }
1758 template <>
1759 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
1760  Index stride) {
1761  __lsx_vstelm_h((__m128i)from, to, 0, 0);
1762  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1763  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1764  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1765  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1766  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1767  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1768  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1769 }
1770 template <>
1771 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
1772  Index stride) {
1773  __lsx_vstelm_w((__m128i)from, to, 0, 0);
1774  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1775  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1776  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1777 }
1778 template <>
1779 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
1780  Index stride) {
1781  __lsx_vstelm_d((__m128i)from, to, 0, 0);
1782  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1783 }
1784 template <>
1785 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
1786  Index stride) {
1787  __lsx_vstelm_b((__m128i)from, to, 0, 0);
1788  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1789  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1790  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1791  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1792  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1793  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1794  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1795  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1796  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1797  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1798  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1799  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1800  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1801  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1802  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1803 }
1804 template <>
1805 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
1806  Index stride) {
1807  __lsx_vstelm_h((__m128i)from, to, 0, 0);
1808  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1809  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1810  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1811  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1812  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1813  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1814  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1815 }
1816 template <>
1817 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
1818  Index stride) {
1819  __lsx_vstelm_w((__m128i)from, to, 0, 0);
1820  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1821  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1822  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1823 }
1824 template <>
1825 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
1826  Index stride) {
1827  __lsx_vstelm_d((__m128i)from, to, 0, 0);
1828  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1829 }
1830 
1831 template <>
1832 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1833  __builtin_prefetch(addr);
1834 }
1835 template <>
1836 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1837  __builtin_prefetch(addr);
1838 }
1839 template <>
1840 EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
1841  __builtin_prefetch(addr);
1842 }
1843 template <>
1844 EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
1845  __builtin_prefetch(addr);
1846 }
1847 template <>
1848 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
1849  __builtin_prefetch(addr);
1850 }
1851 template <>
1852 EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1853  __builtin_prefetch(addr);
1854 }
1855 template <>
1856 EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
1857  __builtin_prefetch(addr);
1858 }
1859 template <>
1860 EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
1861  __builtin_prefetch(addr);
1862 }
1863 template <>
1864 EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1865  __builtin_prefetch(addr);
1866 }
1867 template <>
1868 EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
1869  __builtin_prefetch(addr);
1870 }
1871 
1872 template <>
1873 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1874  float v;
1875  __lsx_vstelm_w(a, &v, 0, 0);
1876  return v;
1877 }
1878 template <>
1879 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1880  double v;
1881  __lsx_vstelm_d(a, &v, 0, 0);
1882  return v;
1883 }
1884 
1885 template <>
1886 EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
1887  return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
1888 }
1889 template <>
1890 EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
1891  return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
1892 }
1893 template <>
1894 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
1895  return __lsx_vpickve2gr_w((__m128i)a, 0);
1896 }
1897 template <>
1898 EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1899  return __lsx_vpickve2gr_d((__m128i)a, 0);
1900 }
1901 template <>
1902 EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
1903  return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
1904 }
1905 template <>
1906 EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
1907  return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
1908 }
1909 template <>
1910 EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1911  return __lsx_vpickve2gr_wu((__m128i)a, 0);
1912 }
1913 template <>
1914 EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
1915  return __lsx_vpickve2gr_du((__m128i)a, 0);
1916 }
1917 
1918 template <>
1919 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1920  return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
1921 }
1922 template <>
1923 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1924  return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
1925 }
1926 template <>
1927 EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1928  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1929 }
1930 template <>
1931 EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1932  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1933 }
1934 template <>
1935 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1936  return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1937 }
1938 template <>
1939 EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1940  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1941 }
1942 template <>
1943 EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1944  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1945 }
1946 template <>
1947 EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1948  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1949 }
1950 template <>
1951 EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1952  return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1953 }
1954 template <>
1955 EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
1956  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1957 }
1958 
1959 template <>
1960 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1961  Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1962  return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1963 }
1964 template <>
1965 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1966  return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
1967 }
1968 template <>
1969 EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
1970  Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
1971  Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
1972  Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
1973  return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
1974 }
1975 template <>
1976 EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
1977  Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
1978  Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
1979  return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
1980 }
1981 template <>
1982 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
1983  Packet2l tmp = __lsx_vhaddw_d_w(a, a);
1984  return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
1985 }
1986 template <>
1987 EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
1988  return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
1989 }
1990 template <>
1991 EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
1992  Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
1993  Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
1994  Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
1995  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
1996 }
1997 template <>
1998 EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
1999  Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
2000  Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
2001  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
2002 }
2003 template <>
2004 EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
2005  Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
2006  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
2007 }
2008 template <>
2009 EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
2010  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
2011 }
2012 
2013 template <>
2014 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
2015  Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
2016  return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
2017 }
2018 template <>
2019 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
2020  return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
2021 }
2022 template <>
2023 EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
2024  Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
2025  Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2026  Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2027  return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2028 }
2029 template <>
2030 EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
2031  Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
2032  Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2033  return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2034 }
2035 template <>
2036 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
2037  Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
2038  return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2039 }
2040 template <>
2041 EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
2042  return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
2043 }
2044 template <>
2045 EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
2046  Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
2047  Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2048  Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2049  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2050 }
2051 template <>
2052 EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
2053  Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
2054  Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2055  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2056 }
2057 template <>
2058 EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
2059  Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
2060  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2061 }
2062 template <>
2063 EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
2064  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
2065 }
2066 
2067 template <>
2068 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2069  Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2070  return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2071 }
2072 template <>
2073 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
2074  return pfirst(__lsx_vfmin_d(a, preverse(a)));
2075 }
2076 template <>
2077 EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
2078  Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2079  Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2080  Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2081  return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2082 }
2083 template <>
2084 EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
2085  Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2086  Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2087  return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2088 }
2089 template <>
2090 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
2091  Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2092  return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2093 }
2094 template <>
2095 EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
2096  return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
2097 }
2098 template <>
2099 EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
2100  Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2101  Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2102  Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2103  return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2104 }
2105 template <>
2106 EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
2107  Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2108  Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2109  return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2110 }
2111 template <>
2112 EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
2113  Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2114  return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2115 }
2116 template <>
2117 EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
2118  return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
2119 }
2120 
2121 template <>
2122 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2123  Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2124  return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2125 }
2126 template <>
2127 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
2128  return pfirst(__lsx_vfmax_d(a, preverse(a)));
2129 }
2130 template <>
2131 EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
2132  Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2133  Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2134  Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2135  return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2136 }
2137 template <>
2138 EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
2139  Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2140  Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2141  return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2142 }
2143 template <>
2144 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
2145  Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2146  return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2147 }
2148 template <>
2149 EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
2150  return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
2151 }
2152 template <>
2153 EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
2154  Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2155  Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2156  Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2157  return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2158 }
2159 template <>
2160 EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
2161  Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2162  Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2163  return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2164 }
2165 template <>
2166 EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
2167  Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2168  return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2169 }
2170 template <>
2171 EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
2172  return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
2173 }
2174 
2175 template <>
2176 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
2177  return __lsx_vfsqrt_s(a);
2178 }
2179 template <>
2180 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
2181  return __lsx_vfsqrt_d(a);
2182 }
2183 
2184 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2185  Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2186  Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2187  Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2188  Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2189 
2190  kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
2191  kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
2192  kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
2193  kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
2194 }
2195 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2196  Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2197  kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2198  kernel.packet[1] = tmp;
2199 }
// In-place transpose of a 16x16 block of 8-bit elements held in sixteen Packet16c packets.
// The block is run through four interleave stages of doubling element width
// (byte -> halfword -> word -> doubleword). Assuming the usual vilvl/vilvh semantics
// (the second operand supplies the even result lanes), packet[c] ends up holding
// element c of each of the sixteen input packets, in input-packet order.
2200 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
  // Stage 1: byte-interleave each pair of adjacent input packets (low and high halves).
2201  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2202  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2203  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2204  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2205  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2206  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2207  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2208  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2209  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2210  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2211  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2212  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2213  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2214  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2215  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2216  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2217

  // Stage 2: halfword-interleave the stage-1 results, merging pairs of packet pairs.
2218  __m128i s0 = __lsx_vilvl_h(t2, t0);
2219  __m128i s1 = __lsx_vilvh_h(t2, t0);
2220  __m128i s2 = __lsx_vilvl_h(t3, t1);
2221  __m128i s3 = __lsx_vilvh_h(t3, t1);
2222  __m128i s4 = __lsx_vilvl_h(t6, t4);
2223  __m128i s5 = __lsx_vilvh_h(t6, t4);
2224  __m128i s6 = __lsx_vilvl_h(t7, t5);
2225  __m128i s7 = __lsx_vilvh_h(t7, t5);
2226  __m128i s8 = __lsx_vilvl_h(ta, t8);
2227  __m128i s9 = __lsx_vilvh_h(ta, t8);
2228  __m128i sa = __lsx_vilvl_h(tb, t9);
2229  __m128i sb = __lsx_vilvh_h(tb, t9);
2230  __m128i sc = __lsx_vilvl_h(te, tc);
2231  __m128i sd = __lsx_vilvh_h(te, tc);
2232  __m128i se = __lsx_vilvl_h(tf, td);
2233  __m128i sf = __lsx_vilvh_h(tf, td);
2234

  // Stage 3: word-interleave; u0..u7 come from input packets 0-7, u8..uf from packets 8-15.
2235  __m128i u0 = __lsx_vilvl_w(s4, s0);
2236  __m128i u1 = __lsx_vilvh_w(s4, s0);
2237  __m128i u2 = __lsx_vilvl_w(s5, s1);
2238  __m128i u3 = __lsx_vilvh_w(s5, s1);
2239  __m128i u4 = __lsx_vilvl_w(s6, s2);
2240  __m128i u5 = __lsx_vilvh_w(s6, s2);
2241  __m128i u6 = __lsx_vilvl_w(s7, s3);
2242  __m128i u7 = __lsx_vilvh_w(s7, s3);
2243  __m128i u8 = __lsx_vilvl_w(sc, s8);
2244  __m128i u9 = __lsx_vilvh_w(sc, s8);
2245  __m128i ua = __lsx_vilvl_w(sd, s9);
2246  __m128i ub = __lsx_vilvh_w(sd, s9);
2247  __m128i uc = __lsx_vilvl_w(se, sa);
2248  __m128i ud = __lsx_vilvh_w(se, sa);
2249  __m128i ue = __lsx_vilvl_w(sf, sb);
2250  __m128i uf = __lsx_vilvh_w(sf, sb);
2251

  // Stage 4: doubleword-interleave the two groups and write the results back into the block.
2252  kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2253  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2254  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2255  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2256  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2257  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2258  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2259  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2260  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2261  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2262  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2263  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2264  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2265  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2266  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2267  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2268 }
2269 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2270  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2271  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2272  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2273  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2274  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2275  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2276  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2277  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2278 
2279  __m128i s0 = __lsx_vilvl_h(t2, t0);
2280  __m128i s1 = __lsx_vilvh_h(t2, t0);
2281  __m128i s2 = __lsx_vilvl_h(t3, t1);
2282  __m128i s3 = __lsx_vilvh_h(t3, t1);
2283  __m128i s4 = __lsx_vilvl_h(t6, t4);
2284  __m128i s5 = __lsx_vilvh_h(t6, t4);
2285  __m128i s6 = __lsx_vilvl_h(t7, t5);
2286  __m128i s7 = __lsx_vilvh_h(t7, t5);
2287 
2288  kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2289  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2290  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2291  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2292  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2293  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2294  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2295  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2296 }
2297 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2298  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2299  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2300  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2301  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2302 
2303  kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2304  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2305  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2306  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2307 }
2308 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2309  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2310  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2311  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2312  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2313  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2314  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2315  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2316  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2317 
2318  __m128i s0 = __lsx_vilvl_w(t2, t0);
2319  __m128i s1 = __lsx_vilvh_w(t2, t0);
2320  __m128i s2 = __lsx_vilvl_w(t3, t1);
2321  __m128i s3 = __lsx_vilvh_w(t3, t1);
2322  __m128i s4 = __lsx_vilvl_w(t6, t4);
2323  __m128i s5 = __lsx_vilvh_w(t6, t4);
2324  __m128i s6 = __lsx_vilvl_w(t7, t5);
2325  __m128i s7 = __lsx_vilvh_w(t7, t5);
2326 
2327  kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2328  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2329  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2330  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2331  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2332  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2333  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2334  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2335 }
2336 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2337  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2338  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2339  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2340  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2341 
2342  kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2343  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2344  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2345  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2346 }
2347 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2348  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2349  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2350  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2351  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2352 
2353  kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2354  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2355  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2356  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2357 }
2358 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2359  __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2360  kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2361  kernel.packet[1] = tmp;
2362 }
// In-place transpose of a 16x16 block of 8-bit elements held in sixteen Packet16uc packets.
// The block is run through four interleave stages of doubling element width
// (byte -> halfword -> word -> doubleword). Assuming the usual vilvl/vilvh semantics
// (the second operand supplies the even result lanes), packet[c] ends up holding
// element c of each of the sixteen input packets, in input-packet order.
2363 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
  // Stage 1: byte-interleave each pair of adjacent input packets (low and high halves).
2364  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2365  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2366  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2367  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2368  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2369  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2370  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2371  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2372  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2373  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2374  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2375  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2376  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2377  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2378  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2379  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2380

  // Stage 2: halfword-interleave the stage-1 results, merging pairs of packet pairs.
2381  __m128i s0 = __lsx_vilvl_h(t2, t0);
2382  __m128i s1 = __lsx_vilvh_h(t2, t0);
2383  __m128i s2 = __lsx_vilvl_h(t3, t1);
2384  __m128i s3 = __lsx_vilvh_h(t3, t1);
2385  __m128i s4 = __lsx_vilvl_h(t6, t4);
2386  __m128i s5 = __lsx_vilvh_h(t6, t4);
2387  __m128i s6 = __lsx_vilvl_h(t7, t5);
2388  __m128i s7 = __lsx_vilvh_h(t7, t5);
2389  __m128i s8 = __lsx_vilvl_h(ta, t8);
2390  __m128i s9 = __lsx_vilvh_h(ta, t8);
2391  __m128i sa = __lsx_vilvl_h(tb, t9);
2392  __m128i sb = __lsx_vilvh_h(tb, t9);
2393  __m128i sc = __lsx_vilvl_h(te, tc);
2394  __m128i sd = __lsx_vilvh_h(te, tc);
2395  __m128i se = __lsx_vilvl_h(tf, td);
2396  __m128i sf = __lsx_vilvh_h(tf, td);
2397

  // Stage 3: word-interleave; u0..u7 come from input packets 0-7, u8..uf from packets 8-15.
2398  __m128i u0 = __lsx_vilvl_w(s4, s0);
2399  __m128i u1 = __lsx_vilvh_w(s4, s0);
2400  __m128i u2 = __lsx_vilvl_w(s5, s1);
2401  __m128i u3 = __lsx_vilvh_w(s5, s1);
2402  __m128i u4 = __lsx_vilvl_w(s6, s2);
2403  __m128i u5 = __lsx_vilvh_w(s6, s2);
2404  __m128i u6 = __lsx_vilvl_w(s7, s3);
2405  __m128i u7 = __lsx_vilvh_w(s7, s3);
2406  __m128i u8 = __lsx_vilvl_w(sc, s8);
2407  __m128i u9 = __lsx_vilvh_w(sc, s8);
2408  __m128i ua = __lsx_vilvl_w(sd, s9);
2409  __m128i ub = __lsx_vilvh_w(sd, s9);
2410  __m128i uc = __lsx_vilvl_w(se, sa);
2411  __m128i ud = __lsx_vilvh_w(se, sa);
2412  __m128i ue = __lsx_vilvl_w(sf, sb);
2413  __m128i uf = __lsx_vilvh_w(sf, sb);
2414

  // Stage 4: doubleword-interleave the two groups and write the results back into the block.
2415  kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2416  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2417  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2418  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2419  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2420  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2421  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2422  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2423  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2424  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2425  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2426  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2427  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2428  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2429  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2430  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2431 }
2432 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2433  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2434  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2435  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2436  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2437  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2438  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2439  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2440  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2441 
2442  __m128i s0 = __lsx_vilvl_h(t2, t0);
2443  __m128i s1 = __lsx_vilvh_h(t2, t0);
2444  __m128i s2 = __lsx_vilvl_h(t3, t1);
2445  __m128i s3 = __lsx_vilvh_h(t3, t1);
2446  __m128i s4 = __lsx_vilvl_h(t6, t4);
2447  __m128i s5 = __lsx_vilvh_h(t6, t4);
2448  __m128i s6 = __lsx_vilvl_h(t7, t5);
2449  __m128i s7 = __lsx_vilvh_h(t7, t5);
2450 
2451  kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2452  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2453  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2454  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2455  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2456  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2457  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2458  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2459 }
2460 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2461  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2462  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2463  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2464  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2465 
2466  kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2467  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2468  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2469  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2470 }
2471 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2472  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2473  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2474  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2475  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2476  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2477  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2478  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2479  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2480 
2481  __m128i s0 = __lsx_vilvl_w(t2, t0);
2482  __m128i s1 = __lsx_vilvh_w(t2, t0);
2483  __m128i s2 = __lsx_vilvl_w(t3, t1);
2484  __m128i s3 = __lsx_vilvh_w(t3, t1);
2485  __m128i s4 = __lsx_vilvl_w(t6, t4);
2486  __m128i s5 = __lsx_vilvh_w(t6, t4);
2487  __m128i s6 = __lsx_vilvl_w(t7, t5);
2488  __m128i s7 = __lsx_vilvh_w(t7, t5);
2489 
2490  kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2491  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2492  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2493  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2494  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2495  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2496  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2497  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2498 }
2499 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2500  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2501  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2502  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2503  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2504 
2505  kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2506  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2507  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2508  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2509 }
2510 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2511  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2512  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2513  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2514  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2515 
2516  kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2517  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2518  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2519  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2520 }
2521 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
2522  __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2523  kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2524  kernel.packet[1] = tmp;
2525 }
2526 
2527 template <>
2528 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
2529  return __lsx_vfrsqrt_s(a);
2530 }
2531 template <>
2532 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
2533  return __lsx_vfrsqrt_d(a);
2534 }
2535 
2536 template <>
2537 EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {
2538  return __lsx_vfrintrm_s(a);
2539 }
2540 template <>
2541 EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {
2542  return __lsx_vfrintrm_d(a);
2543 }
2544 
2545 template <>
2546 EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {
2547  return __lsx_vfrintrp_s(a);
2548 }
2549 template <>
2550 EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {
2551  return __lsx_vfrintrp_d(a);
2552 }
2553 
2554 template <>
2555 EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {
2556  const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));
2557  const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
2558  return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
2559 }
2560 template <>
2561 EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {
2562  const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
2563  const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
2564  return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
2565 }
2566 
2567 template <>
2568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
2569  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2570 }
2571 template <>
2572 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
2573  return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2574 }
2575 
2576 template <>
2577 EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2578  int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2579  *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2580  *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2581  return __lsx_vld(tmp, 0);
2582 }
2583 template <>
2584 EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2585  uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2586  *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2587  *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2588  return __lsx_vld(tmp, 0);
2589 }
2590 template <>
2591 EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2592  int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2593  return __lsx_vld(tmp, 0);
2594 }
2595 template <>
2596 EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2597  uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2598  return __lsx_vld(tmp, 0);
2599 }
2600 template <>
2601 EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2602  int32_t tmp[4] = {*from, *from, *from, *from};
2603  return __lsx_vld(tmp, 0);
2604 }
2605 template <>
2606 EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2607  uint32_t tmp[4] = {*from, *from, *from, *from};
2608  return __lsx_vld(tmp, 0);
2609 }
2610 
2611 template <>
2612 EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2613  return __lsx_vmsub_b(pnegate(c), a, b);
2614 }
2615 template <>
2616 EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2617  return __lsx_vmsub_h(pnegate(c), a, b);
2618 }
2619 template <>
2620 EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2621  return __lsx_vmsub_w(pnegate(c), a, b);
2622 }
2623 template <>
2624 EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2625  return __lsx_vmsub_d(pnegate(c), a, b);
2626 }
2627 
2628 template <>
2629 EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2630  return __lsx_vmadd_b(pnegate(c), a, b);
2631 }
2632 template <>
2633 EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2634  return __lsx_vmadd_h(pnegate(c), a, b);
2635 }
2636 template <>
2637 EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2638  return __lsx_vmadd_w(pnegate(c), a, b);
2639 }
2640 template <>
2641 EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2642  return __lsx_vmadd_d(pnegate(c), a, b);
2643 }
2644 
2645 template <>
2646 EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2647  return __lsx_vmsub_b(c, a, b);
2648 }
2649 template <>
2650 EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2651  return __lsx_vmsub_h(c, a, b);
2652 }
2653 template <>
2654 EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2655  return __lsx_vmsub_w(c, a, b);
2656 }
2657 template <>
2658 EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2659  return __lsx_vmsub_d(c, a, b);
2660 }
2661 
2662 template <>
2663 EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {
2664  return pexp_float(_x);
2665 }
2666 template <>
2667 EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {
2668  return pexp_double(_x);
2669 }
2670 
2671 template <>
2672 EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2673  return pldexp_generic(a, exponent);
2674 }
2675 
2676 template <>
2677 EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
2678  return pfrexp_generic(a, exponent);
2679 }
2680 template <>
2681 EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2682  return pfrexp_generic(a, exponent);
2683 }
2684 template <>
2685 EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
2686  Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
2687  return v;
2688 }
2689 template <>
2690 EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
2691  Packet4f v = psub(a, b);
2692  return pabs(v);
2693 }
2694 template <>
2695 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2696  return pmin<Packet4f>(a, b);
2697 }
2698 template <>
2699 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2700  return pmax<Packet4f>(a, b);
2701 }
2702 template <>
2703 EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2704  return (__m128)__lsx_vldrepl_w(from, 0);
2705 }
2706 template <>
2707 EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
2708  return (__m128)__lsx_vsrai_w((__m128i)a, 31);
2709 }
2710 template <>
2711 EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
2712  return __lsx_vfrintrne_s(a);
2713 }
2714 template <>
2715 EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
2716  return __lsx_vfrintrz_s(a);
2717 }
2718 template <>
2719 EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
2720  return __lsx_vfrecip_s(a);
2721 }
2722 
2723 template <>
2724 EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {
2725  Packet2d v = {0.0, 0.0};
2726  return v;
2727 }
2728 template <>
2729 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2730  return pmin<Packet2d>(a, b);
2731 }
2732 template <>
2733 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2734  return pmax<Packet2d>(a, b);
2735 }
2736 template <>
2737 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
2738  return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
2739 }
2740 template <>
2741 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
2742  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2743 }
2744 template <>
2745 EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
2746  return __lsx_vfrintrne_d(a);
2747 }
2748 template <>
2749 EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
2750  return __lsx_vfrintrz_d(a);
2751 }
2752 template <>
2753 EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2754  return pldexp_generic(a, exponent);
2755 }
2756 
2757 template <>
2758 EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
2759  Packet16c v = psub(a, b);
2760  return pabs(v);
2761 }
2762 
2763 template <>
2764 EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
2765  Packet8s v = psub(a, b);
2766  return pabs(v);
2767 }
2768 template <>
2769 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
2770  return __lsx_vbitsel_v(b, a, mask);
2771 }
2772 
2773 template <>
2774 EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
2775  Packet4i v = psub(a, b);
2776  return pabs(v);
2777 }
2778 template <>
2779 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
2780  return __lsx_vbitsel_v(b, a, mask);
2781 }
2782 
2783 template <>
2784 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
2785  return __lsx_vbitsel_v(b, a, mask);
2786 }
2787 
2788 template <>
2789 EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2790  return __lsx_vdiv_bu(a, b);
2791 }
2792 template <>
2793 EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2794  Packet16uc v = psub(a, b);
2795  return pabs(v);
2796 }
2797 template <>
2798 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
2799  const Packet16uc& b) {
2800  return __lsx_vbitsel_v(b, a, mask);
2801 }
2802 template <>
2803 EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
2804  __m128i res = {0, 0};
2805  __m128i add = {0x0808080808080808, 0x0808080808080808};
2806  for (int i = 0; i < 4; i++) {
2807  const __m128i temp = __lsx_vor_v(res, add);
2808  const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
2809  res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
2810  add = __lsx_vsrli_b(add, 1);
2811  }
2812  return res;
2813 }
2814 
2815 template <>
2816 EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
2817  Packet8us v = psub(a, b);
2818  return pabs(v);
2819 }
2820 template <>
2821 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
2822  return __lsx_vbitsel_v(b, a, mask);
2823 }
2824 template <>
2825 EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
2826  __m128i res = {0, 0};
2827  __m128i add = {0x0080008000800080, 0x0080008000800080};
2828  for (int i = 0; i < 4; i++) {
2829  const __m128i temp = __lsx_vor_v(res, add);
2830  const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
2831  res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
2832  add = __lsx_vsrli_h(add, 1);
2833  }
2834  return res;
2835 }
2836 
2837 template <>
2838 EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2839  Packet4ui v = psub(a, b);
2840  return pabs(v);
2841 }
2842 template <>
2843 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
2844  return __lsx_vbitsel_v(b, a, mask);
2845 }
2846 template <>
2847 EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
2848  __m128i res = {0, 0};
2849  __m128i add = {0x0000800000008000, 0x0000800000008000};
2850  for (int i = 0; i < 4; i++) {
2851  const __m128i temp = __lsx_vor_v(res, add);
2852  const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
2853  res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
2854  add = __lsx_vsrli_w(add, 1);
2855  }
2856  return res;
2857 }
2858 
2859 template <>
2860 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
2861  return __lsx_vbitsel_v(b, a, mask);
2862 }
2863 
2864 } // namespace internal
2865 } // namespace Eigen
2866 #endif
Definition: Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82