$darkmode
Eigen  5.0.1-dev
PacketMath.h
1 
2 #ifndef EIGEN_HVX_PACKET_MATH_H
3 #define EIGEN_HVX_PACKET_MATH_H
4 
5 // Only support 128B HVX now.
6 // Floating-point operations are supported only since V68.
7 #if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
8 
9 // None of the floating-point operations conform to the IEEE standard.
10 // From the HVX documentation:
11 // There is no concept of infinity or NaN. QFloat saturates to maximum
12 // exponent with maximum positive or minimum negative significand.
13 
14 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
15 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
16 #endif
17 
18 namespace Eigen {
19 namespace internal {
20 
21 // HVX utilities.
22 
23 template <int D>
24 EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
25  HVX_Vector v;
26 #if EIGEN_COMP_CLANG
27  // Use inlined assembly for aligned vmem load on unaligned memory.
28  // Use type cast to HVX_Vector* may mess up with compiler data alignment.
29  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
30 #else
31  void* aligned_mem =
32  reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
33  memcpy(&v, aligned_mem, __HVX_LENGTH__);
34 #endif
35  return v;
36 }
37 
38 template <typename T>
39 EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
40  HVX_Vector v;
41  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
42  return v;
43 }
44 
45 template <typename T>
46 EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
47  HVX_Vector v;
48  memcpy(&v, mem, __HVX_LENGTH__);
49  return v;
50 }
51 
52 template <size_t Size, size_t Alignment, typename T>
53 EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
54 #if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
55  // Fast partial vector load through aligned vmem load.
56  // The load may past end of array but is aligned to prevent memory fault.
57  HVX_Vector v0 = HVX_vmem<0>(mem);
58  HVX_Vector v1 = v0;
59  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
60  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
61  // Data size less than alignment will never cross multiple aligned vectors.
62  v1 = v0;
63  }
64  else {
65  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
66  if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
67  v1 = HVX_vmem<1>(mem);
68  } else {
69  v1 = v0;
70  }
71  }
72  return Q6_V_valign_VVR(v1, v0, mem_addr);
73 #else
74  HVX_Vector v;
75  memcpy(&v, mem, Size * sizeof(T));
76  return v;
77 #endif
78 }
79 
80 template <typename T>
81 EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
82  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
83 }
84 
85 template <typename T>
86 EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
87  memcpy(mem, &v, __HVX_LENGTH__);
88 }
89 
90 template <size_t Size, size_t Alignment, typename T>
91 EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
92  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
93  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
94  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
95  uintptr_t right_off = left_off + Size * sizeof(T);
96 
97  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
98  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
99 
100  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
101  if (right_off > __HVX_LENGTH__) {
102  Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
103  qr = Q6_Q_vcmp_eq_VbVb(value, value);
104  }
105  }
106 
107  ql_not = Q6_Q_or_QQn(ql_not, qr);
108  Q6_vmem_QnRIV(ql_not, mem, value);
109 }
110 
111 // Packet definitions.
// Tag selecting which fraction of an HVX vector a packet type represents.
enum class HVXPacketSize {
  Full,    // whole vector
  Half,    // half of the vector
  Quarter  // quarter of the vector
};
117 
118 // Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
119 // Wrap different vector type (float32, int32, etc) to different class with
120 // explicit constructor and casting back-and-force to HVX_Vector.
121 template <HVXPacketSize T>
122 class HVXPacket {
123  public:
124  HVXPacket() = default;
125  static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
126  HVX_Vector Get() const { return m_val; }
127 
128  private:
129  explicit HVXPacket(HVX_Vector v) : m_val(v) {}
130  HVX_Vector m_val = Q6_V_vzero();
131 };
132 
133 typedef HVXPacket<HVXPacketSize::Full> Packet32f;
134 typedef HVXPacket<HVXPacketSize::Half> Packet16f;
135 typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
136 
137 // Packet traits.
138 template <>
139 struct packet_traits<float> : default_packet_traits {
140  typedef Packet32f type;
141  typedef Packet16f half;
142  enum {
143  Vectorizable = 1,
144  AlignedOnScalar = 1,
145  size = 32,
146 
147  HasCmp = 1,
148  HasAdd = 1,
149  HasSub = 1,
150  HasShift = 0,
151  HasMul = 1,
152  HasNegate = 1,
153  HasAbs = 1,
154  HasArg = 0,
155  HasAbs2 = 0,
156  HasAbsDiff = 0,
157  HasMin = 1,
158  HasMax = 1,
159  HasConj = 0,
160  HasSetLinear = 0,
161  HasBlend = 0,
162 
163  HasDiv = 0,
164 
165  HasSin = 0,
166  HasCos = 0,
167  HasACos = 0,
168  HasASin = 0,
169  HasATan = 0,
170  HasATanh = 0,
171  HasLog = 0,
172  HasExp = 0,
173  HasSqrt = 0,
174  HasRsqrt = 0,
175  HasTanh = 0,
176  HasErf = 0,
177  HasBessel = 0,
178  HasNdtri = 0
179  };
180 };
181 
182 template <>
183 struct unpacket_traits<Packet32f> {
184  typedef float type;
185  typedef Packet16f half;
186  enum {
187  size = 32,
188  alignment = Aligned128,
189  vectorizable = true,
190  masked_load_available = false,
191  masked_store_available = false
192  };
193 };
194 
195 template <>
196 struct unpacket_traits<Packet16f> {
197  typedef float type;
198  typedef Packet8f half;
199  enum {
200  size = 16,
201  // Many code assume alignment on packet size instead of following trait
202  // So we do not use Aligned128 to optimize aligned load/store,
203  alignment = Aligned64,
204  vectorizable = true,
205  masked_load_available = false,
206  masked_store_available = false
207  };
208 };
209 
210 template <>
211 struct unpacket_traits<Packet8f> {
212  typedef float type;
213  typedef Packet8f half;
214  enum {
215  size = 8,
216  // Many code assume alignment on packet size instead of following trait
217  // So we do not use Aligned128 to optimize aligned load/store,
218  alignment = Aligned32,
219  vectorizable = true,
220  masked_load_available = false,
221  masked_store_available = false
222  };
223 };
224 
225 // float32 operations.
226 template <HVXPacketSize T>
227 EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
228  return HVXPacket<T>::Create(Q6_V_vzero());
229 }
230 template <>
231 EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
232  return pzero_hvx(Packet32f());
233 }
234 template <>
235 EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
236  return pzero_hvx(Packet16f());
237 }
238 template <>
239 EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
240  return pzero_hvx(Packet8f());
241 }
242 
243 template <HVXPacketSize T>
244 EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
245  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
246  return unpacket_traits<HVXPacket<T>>::half::Create(
247  Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
248 }
249 template <>
250 EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
251  return predux_half_dowto4_hvx(a);
252 }
253 template <>
254 EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
255  return predux_half_dowto4_hvx(a);
256 }
257 
258 template <HVXPacketSize T>
259 EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
260  union {
261  float f;
262  int32_t i;
263  } u;
264  u.f = from;
265  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
266 }
267 template <>
268 EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
269  return pset1_hvx<HVXPacketSize::Full>(from);
270 }
271 template <>
272 EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
273  return pset1_hvx<HVXPacketSize::Half>(from);
274 }
275 template <>
276 EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
277  return pset1_hvx<HVXPacketSize::Quarter>(from);
278 }
279 
280 template <>
281 EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
282  return Packet32f::Create(HVX_load(from));
283 }
284 template <>
285 EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
286  return Packet16f::Create(
287  HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
288 }
289 template <>
290 EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
291  return Packet8f::Create(
292  HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
293 }
294 
295 template <>
296 EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
297  return Packet32f::Create(HVX_loadu(from));
298 }
299 template <>
300 EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
301  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
302 }
303 template <>
304 EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
305  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
306 }
307 
308 template <>
309 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
310  HVX_store(to, from.Get());
311 }
312 template <>
313 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
314  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
315 }
316 template <>
317 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
318  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
319 }
320 
321 template <>
322 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
323  HVX_storeu(to, from.Get());
324 }
325 template <>
326 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
327  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
328 }
329 template <>
330 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
331  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
332 }
333 
334 template <HVXPacketSize T>
335 EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
336  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
337 }
338 template <>
339 EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
340  return pmul_hvx(a, b);
341 }
342 template <>
343 EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
344  return pmul_hvx(a, b);
345 }
346 template <>
347 EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
348  return pmul_hvx(a, b);
349 }
350 
351 template <HVXPacketSize T>
352 EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
353  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
354 }
355 template <>
356 EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
357  return padd_hvx(a, b);
358 }
359 template <>
360 EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
361  return padd_hvx(a, b);
362 }
363 template <>
364 EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
365  return padd_hvx(a, b);
366 }
367 
368 template <HVXPacketSize T>
369 EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
370  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
371 }
372 template <>
373 EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
374  return psub_hvx(a, b);
375 }
376 template <>
377 EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
378  return psub_hvx(a, b);
379 }
380 template <>
381 EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
382  return psub_hvx(a, b);
383 }
384 
385 template <HVXPacketSize T>
386 EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
387  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
388 }
389 template <>
390 EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
391  return pnegate_hvx(a);
392 }
393 template <>
394 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
395  return pnegate_hvx(a);
396 }
397 template <>
398 EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
399  return pnegate_hvx(a);
400 }
401 
402 template <HVXPacketSize T>
403 EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
404  return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
405 }
406 template <>
407 EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
408  return ptrue_hvx(a);
409 }
410 template <>
411 EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
412  return ptrue_hvx(a);
413 }
414 template <>
415 EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
416  return ptrue_hvx(a);
417 }
418 
419 template <HVXPacketSize T>
420 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
421  HVX_Vector v_true = ptrue(a).Get();
422  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
423  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
424 }
425 template <>
426 EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
427  return pcmp_le_hvx(a, b);
428 }
429 template <>
430 EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
431  return pcmp_le_hvx(a, b);
432 }
433 template <>
434 EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
435  return pcmp_le_hvx(a, b);
436 }
437 
438 template <HVXPacketSize T>
439 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
440  HVX_Vector v_true = ptrue(a).Get();
441  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
442  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
443 }
444 template <>
445 EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
446  return pcmp_eq_hvx(a, b);
447 }
448 template <>
449 EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
450  return pcmp_eq_hvx(a, b);
451 }
452 template <>
453 EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
454  return pcmp_eq_hvx(a, b);
455 }
456 
457 template <HVXPacketSize T>
458 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
459  HVX_Vector v_true = ptrue(a).Get();
460  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
461  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
462 }
463 template <>
464 EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
465  return pcmp_lt_hvx(a, b);
466 }
467 template <>
468 EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
469  return pcmp_lt_hvx(a, b);
470 }
471 template <>
472 EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
473  return pcmp_lt_hvx(a, b);
474 }
475 
476 template <HVXPacketSize T>
477 EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
478  HVX_Vector v_true = ptrue(a).Get();
479  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
480  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
481 }
482 template <>
483 EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
484  return pcmp_lt_or_nan_hvx(a, b);
485 }
486 template <>
487 EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
488  return pcmp_lt_or_nan_hvx(a, b);
489 }
490 template <>
491 EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
492  return pcmp_lt_or_nan_hvx(a, b);
493 }
494 
495 template <HVXPacketSize T>
496 EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
497  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
498 }
499 template <>
500 EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
501  return pabs_hvx(a);
502 }
503 template <>
504 EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
505  return pabs_hvx(a);
506 }
507 template <>
508 EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
509  return pabs_hvx(a);
510 }
511 
512 template <HVXPacketSize T>
513 EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
514  union {
515  float array[1];
516  HVX_Vector vector;
517  } HVX_and_array;
518  HVX_and_array.vector = a.Get();
519  return HVX_and_array.array[0];
520 }
521 template <>
522 EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
523  return pfirst_hvx(a);
524 }
525 template <>
526 EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
527  return pfirst_hvx(a);
528 }
529 template <>
530 EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
531  return pfirst_hvx(a);
532 }
533 
534 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
535  // Shuffle the 32-bit lanes.
536  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
537  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
538 
539  // Shuffle the 64-bit lanes.
540  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
541  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
542  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
543  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
544  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
545  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
546 }
547 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
548  // Shuffle the 32-bit lanes.
549  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
550  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
551 
552  // Shuffle the 64-bit lanes.
553  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
554 
555  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
556  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
557  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
558  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
559 }
560 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
561  // Shuffle the 32-bit lanes.
562  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
563  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
564 
565  // Shuffle the 64-bit lanes.
566  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
567 
568  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
569  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
570  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
571  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
572 }
573 
574 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
575  // Shuffle the 32-bit lanes.
576  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
577  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
578  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
579  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
580 
581  // Shuffle the 64-bit lanes.
582  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
583  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
584 
585  // Shuffle the 128-bit lanes.
586  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
587 
588  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
589  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
590  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
591  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
592  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
593  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
594  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
595  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
596 }
597 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
598  // Shuffle the 32-bit lanes.
599  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
600  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
601  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
602  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
603  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
604  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
605  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
606  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
607 
608  // Shuffle the 64-bit lanes.
609  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
610  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
611  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
612  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
613 
614  // Shuffle the 128-bit lanes.
615  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
616  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
617  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
618  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
619 
620  // Shuffle the 256-bit lanes.
621  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
622  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
623  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
624  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
625 
626  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
627  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
628  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
629  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
630  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
631  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
632  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
633  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
634  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
635  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
636  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
637  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
638  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
639  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
640  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
641  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
642 }
643 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
644  // Shuffle the 32-bit lanes.
645  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
646  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
647  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
648  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
649  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
650  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
651  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
652  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
653  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
654  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
655  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
656  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
657  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
658  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
659  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
660  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
661 
662  // Shuffle the 64-bit lanes.
663  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
664  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
665  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
666  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
667  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
668  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
669  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
670  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
671  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
672  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
673  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
674  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
675  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
676  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
677  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
678  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
679 
680  // Shuffle the 128-bit lanes.
681  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
682  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
683  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
684  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
685  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
686  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
687  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
688  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
689  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
690  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
691  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
692  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
693  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
694  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
695  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
696  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
697 
698  // Shuffle the 256-bit lanes.
699  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
700  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
701  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
702  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
703  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
704  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
705  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
706  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
707  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
708  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
709  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
710  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
711  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
712  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
713  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
714  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
715 
716  // Shuffle the 512-bit lanes.
717  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
718  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
719  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
720  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
721  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
722  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
723  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
724  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
725  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
726  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
727  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
728  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
729  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
730  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
731  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
732  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
733 
734  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
735  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
736  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
737  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
738  kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
739  kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
740  kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
741  kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
742  kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
743  kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
744  kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
745  kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
746  kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
747  kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
748  kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
749  kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
750  kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
751  kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
752  kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
753  kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
754  kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
755  kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
756  kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
757  kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
758  kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
759  kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
760  kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
761  kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
762  kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
763  kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
764  kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
765  kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
766 }
767 
768 template <HVXPacketSize T>
769 EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
770  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
771  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
772  for (int i = 2; i < packet_size; i <<= 1) {
773  vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
774  }
775  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
776 }
777 template <>
778 EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
779  return predux_hvx(a);
780 }
781 template <>
782 EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
783  return predux_hvx(a);
784 }
785 template <>
786 EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
787  return predux_hvx(a);
788 }
789 
790 template <HVXPacketSize T>
791 EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
792  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
793  HVX_Vector load = HVX_load_partial<size, 0>(from);
794  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
795  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
796 }
797 template <>
798 EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
799  return ploaddup_hvx<HVXPacketSize::Full>(from);
800 }
801 template <>
802 EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
803  return ploaddup_hvx<HVXPacketSize::Half>(from);
804 }
805 template <>
806 EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
807  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
808 }
809 
810 template <HVXPacketSize T>
811 EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
812  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
813  HVX_Vector load = HVX_load_partial<size, 0>(from);
814  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
815  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
816  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
817 }
818 template <>
819 EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
820  return ploadquad_hvx<HVXPacketSize::Full>(from);
821 }
822 template <>
823 EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
824  return ploadquad_hvx<HVXPacketSize::Half>(from);
825 }
826 template <>
827 EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
828  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
829 }
830 
831 template <>
832 EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
833  HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
834  return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
835 }
836 
837 template <>
838 EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
839  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
840  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
841 }
842 
843 template <>
844 EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
845  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
846  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
847 }
848 
849 template <HVXPacketSize T>
850 EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
851  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
852 }
853 template <>
854 EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
855  return pmin_hvx(a, b);
856 }
857 template <>
858 EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
859  return pmin_hvx(a, b);
860 }
861 template <>
862 EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
863  return pmin_hvx(a, b);
864 }
865 
866 template <HVXPacketSize T>
867 EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
868  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
869 }
870 template <>
871 EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
872  return pmax_hvx(a, b);
873 }
874 template <>
875 EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
876  return pmax_hvx(a, b);
877 }
878 template <>
879 EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
880  return pmax_hvx(a, b);
881 }
882 
883 template <HVXPacketSize T>
884 EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
885  return HVXPacket<T>::Create(a.Get() & b.Get());
886 }
887 template <>
888 EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
889  return pand_hvx(a, b);
890 }
891 template <>
892 EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
893  return pand_hvx(a, b);
894 }
895 template <>
896 EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
897  return pand_hvx(a, b);
898 }
899 
900 template <HVXPacketSize T>
901 EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
902  return HVXPacket<T>::Create(a.Get() | b.Get());
903 }
904 template <>
905 EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
906  return por_hvx(a, b);
907 }
908 template <>
909 EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
910  return por_hvx(a, b);
911 }
912 template <>
913 EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
914  return por_hvx(a, b);
915 }
916 
917 template <HVXPacketSize T>
918 EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
919  return HVXPacket<T>::Create(a.Get() ^ b.Get());
920 }
921 template <>
922 EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
923  return pxor_hvx(a, b);
924 }
925 template <>
926 EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
927  return pxor_hvx(a, b);
928 }
929 template <>
930 EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
931  return pxor_hvx(a, b);
932 }
933 
934 template <HVXPacketSize T>
935 EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
936  return HVXPacket<T>::Create(~a.Get());
937 }
938 template <>
939 EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
940  return pnot_hvx(a);
941 }
942 template <>
943 EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
944  return pnot_hvx(a);
945 }
946 template <>
947 EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
948  return pnot_hvx(a);
949 }
950 
951 template <HVXPacketSize T>
952 EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
953  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
954  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
955 }
956 template <>
957 EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
958  return pselect_hvx(mask, a, b);
959 }
960 template <>
961 EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
962  return pselect_hvx(mask, a, b);
963 }
964 template <>
965 EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
966  return pselect_hvx(mask, a, b);
967 }
968 
969 template <HVXPacketSize T, typename Op>
970 EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
971  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
972  HVXPacket<T> vredux = a;
973  for (int i = 1; i < packet_size; i <<= 1) {
974  vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
975  }
976  return pfirst(vredux);
977 }
978 
979 template <>
980 EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
981  return predux_generic(a, pmax<Packet32f>);
982 }
983 template <>
984 EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
985  return predux_generic(a, pmax<Packet16f>);
986 }
987 template <>
988 EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
989  return predux_generic(a, pmax<Packet8f>);
990 }
991 
992 template <>
993 EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
994  return predux_generic(a, pmin<Packet32f>);
995 }
996 template <>
997 EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
998  return predux_generic(a, pmin<Packet16f>);
999 }
1000 template <>
1001 EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
1002  return predux_generic(a, pmin<Packet8f>);
1003 }
1004 
1005 template <>
1006 EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
1007  return predux_generic(a, por<Packet32f>) != 0.0f;
1008 }
1009 template <>
1010 EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
1011  return predux_generic(a, por<Packet16f>) != 0.0f;
1012 }
1013 template <>
1014 EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
1015  return predux_generic(a, por<Packet8f>) != 0.0f;
1016 }
1017 
1018 static const float index_vsf[32]
1019  __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1020  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
1021 
1022 template <HVXPacketSize T>
1023 EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
1024  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
1025 }
1026 template <>
1027 EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
1028  return plset_hvx<HVXPacketSize::Full>(a);
1029 }
1030 template <>
1031 EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
1032  return plset_hvx<HVXPacketSize::Half>(a);
1033 }
1034 template <>
1035 EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
1036  return plset_hvx<HVXPacketSize::Quarter>(a);
1037 }
1038 
1039 template <HVXPacketSize T>
1040 EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
1041  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1042  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1043  pstore<float>(elements, from);
1044  for (Index i = 0; i < packet_size; ++i) {
1045  to[i * stride] = elements[i];
1046  }
1047 }
1048 template <>
1049 EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
1050  pscatter_hvx(to, from, stride);
1051 }
1052 template <>
1053 EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
1054  pscatter_hvx(to, from, stride);
1055 }
1056 template <>
1057 EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
1058  pscatter_hvx(to, from, stride);
1059 }
1060 
1061 template <HVXPacketSize T>
1062 EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
1063  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1064  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1065  for (Index i = 0; i < packet_size; i++) {
1066  elements[i] = from[i * stride];
1067  }
1068  return pload<HVXPacket<T>>(elements);
1069 }
1070 template <>
1071 EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
1072  return pgather_hvx<HVXPacketSize::Full>(from, stride);
1073 }
1074 template <>
1075 EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
1076  return pgather_hvx<HVXPacketSize::Half>(from, stride);
1077 }
1078 template <>
1079 EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
1080  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
1081 }
1082 
1083 } // end namespace internal
1084 } // end namespace Eigen
1085 
1086 #endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
1087 
1088 #endif // EIGEN_HVX_PACKET_MATH_H
Definition: Constants.h:240
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
Definition: Constants.h:238
Definition: Constants.h:239
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82