$darkmode
Eigen-unsupported  5.0.1-dev
TensorReduction.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
13 
14 // clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
15 // so we'll use a macro to make clang happy.
16 #ifndef KERNEL_FRIEND
17 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
18 #define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
19 #else
20 #define KERNEL_FRIEND friend
21 #endif
22 #endif
23 
24 // IWYU pragma: private
25 #include "./InternalHeaderCheck.h"
26 
27 namespace Eigen {
28 
29 namespace internal {
30 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
31 struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
32  typedef traits<XprType> XprTraits;
33  typedef typename XprTraits::Scalar Scalar;
34  typedef typename XprTraits::StorageKind StorageKind;
35  typedef typename XprTraits::Index Index;
36  typedef typename XprType::Nested Nested;
37  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
38  static constexpr int Layout = XprTraits::Layout;
39  typedef typename XprTraits::PointerType PointerType;
40 
41  template <class T>
42  struct MakePointer {
43  // Intermediate typedef to workaround MSVC issue.
44  typedef MakePointer_<T> MakePointerT;
45  typedef typename MakePointerT::Type Type;
46  };
47 };
48 
49 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
50 struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
51  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
52 };
53 
54 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
55 struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
56  typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
57  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
58 };
59 
60 template <typename OutputDims>
61 struct DimInitializer {
62  template <typename InputDims, typename ReducedDims>
63  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
64  const array<bool, internal::array_size<InputDims>::value>& reduced,
65  OutputDims* output_dims, ReducedDims* reduced_dims) {
66  const int NumInputDims = internal::array_size<InputDims>::value;
67  int outputIndex = 0;
68  int reduceIndex = 0;
69  for (int i = 0; i < NumInputDims; ++i) {
70  if (reduced[i]) {
71  (*reduced_dims)[reduceIndex] = input_dims[i];
72  ++reduceIndex;
73  } else {
74  (*output_dims)[outputIndex] = input_dims[i];
75  ++outputIndex;
76  }
77  }
78  }
79 };
80 
81 template <>
82 struct DimInitializer<Sizes<> > {
83  template <typename InputDims, typename Index, size_t Rank>
84  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
85  array<Index, Rank>* reduced_dims) {
86  const int NumInputDims = internal::array_size<InputDims>::value;
87  for (int i = 0; i < NumInputDims; ++i) {
88  (*reduced_dims)[i] = input_dims[i];
89  }
90  }
91 };
92 
93 template <typename ReducedDims, int NumTensorDims, int Layout>
94 struct are_inner_most_dims {
95  static const bool value = false;
96 };
97 template <typename ReducedDims, int NumTensorDims, int Layout>
98 struct preserve_inner_most_dims {
99  static const bool value = false;
100 };
101 
102 template <typename ReducedDims, int NumTensorDims>
103 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
104  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
105  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
106  static const bool tmp3 =
107  index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
108  static const bool value = tmp1 & tmp2 & tmp3;
109 };
110 template <typename ReducedDims, int NumTensorDims>
111 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
112  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
113  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
114  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
115  static const bool value = tmp1 & tmp2 & tmp3;
116 };
117 template <typename ReducedDims, int NumTensorDims>
118 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
119  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
120  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
121  static const bool value = tmp1 & tmp2;
122 };
123 template <typename ReducedDims, int NumTensorDims>
124 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
125  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
126  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
127  static const bool value = tmp1 & tmp2;
128 };
129 
130 template <int DimIndex, typename Self, typename Op>
131 struct GenericDimReducer {
132  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
133  Op& reducer, typename Self::CoeffReturnType* accum) {
134  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
135  for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
136  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
137  GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
138  }
139  }
140 };
141 template <typename Self, typename Op>
142 struct GenericDimReducer<0, Self, Op> {
143  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
144  Op& reducer, typename Self::CoeffReturnType* accum) {
145  for (int j = 0; j < self.m_reducedDims[0]; ++j) {
146  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
147  reducer.reduce(self.m_impl.coeff(input), accum);
148  }
149  }
150 };
151 template <typename Self, typename Op>
152 struct GenericDimReducer<-1, Self, Op> {
153  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
154  typename Self::CoeffReturnType* accum) {
155  reducer.reduce(self.m_impl.coeff(index), accum);
156  }
157 };
158 
159 template <typename Self, typename Op,
160  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
161  bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
162  // GPU threads can quickly run out of stack space
163  // for moderately sized inputs.
164  !Self::RunningOnGPU)>
165 struct InnerMostDimReducer {
166  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
167  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
168  typename Self::CoeffReturnType accum = reducer.initialize();
169  for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
170  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
171  }
172  return reducer.finalize(accum);
173  }
174 };
175 
176 template <typename Self, typename Op>
177 struct InnerMostDimReducer<Self, Op, true, false> {
178  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
179  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
180  using Index = typename Self::Index;
181  constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
182  Index start = 0;
183  typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
184  if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
185  const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
186  typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
187  typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
188  typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
189  const Index offset0 = firstIndex;
190  const Index offset1 = firstIndex + packetSize;
191  const Index offset2 = firstIndex + 2 * packetSize;
192  const Index offset3 = firstIndex + 3 * packetSize;
193  for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
194  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
195  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
196  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
197  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
198  }
199  reducer0.reducePacket(paccum1, &paccum0);
200  reducer0.reducePacket(paccum2, &paccum0);
201  reducer0.reducePacket(paccum3, &paccum0);
202  start = VectorizedSize4;
203  }
204  if (start <= (numValuesToReduce - packetSize)) {
205  const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
206  for (Index j = start; j < VectorizedSize; j += packetSize) {
207  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
208  }
209  start = VectorizedSize;
210  }
211  typename Self::CoeffReturnType accum = reducer0.initialize();
212  for (Index j = start; j < numValuesToReduce; ++j) {
213  reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
214  }
215  return reducer0.finalizeBoth(accum, paccum0);
216  }
217 };
218 
219 #if !defined(EIGEN_HIPCC)
220 
221 // The following implements tree-based reduction, which improves the accuracy
222 // of sum and mean reductions, since each of the n inputs only participates in
223 // O(log n) additions.
224 template <typename T>
225 EIGEN_DEVICE_FUNC inline Index LeafSize() {
226  return 1024;
227 }
228 template <>
229 EIGEN_DEVICE_FUNC inline Index LeafSize<half>() {
230  return 200;
231 }
232 template <>
233 EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() {
234  return 128;
235 }
236 
237 template <typename Self, typename Op>
238 struct InnerMostDimReducer<Self, Op, false, true> {
239  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
240  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
241  const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
242  typename Self::CoeffReturnType accum = reducer.initialize();
243  if (numValuesToReduce > kLeafSize) {
244  const typename Self::Index half = numValuesToReduce / 2;
245  // Recursively reduce the two halves.
246  reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
247  reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
248  return reducer.finalize(accum);
249  } else {
250  return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
251  }
252  }
253 };
254 
255 template <typename Self, typename Op>
256 struct InnerMostDimReducer<Self, Op, true, true> {
257  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
258  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
259  const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
260  const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
261  typename Self::CoeffReturnType accum = reducer.initialize();
262  if (numValuesToReduce > packetSize * kLeafSize) {
263  // Make sure the split point is aligned on a packet boundary.
264  const typename Self::Index split =
265  packetSize *
266  numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize);
267  const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce);
268  reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
269  if (num_left < numValuesToReduce) {
270  reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
271  }
272  return reducer.finalize(accum);
273  } else {
274  return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
275  }
276  }
277 };
278 #endif
279 
280 template <int DimIndex, typename Self, typename Op,
281  bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
282 struct InnerMostDimPreserver {
283  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
284  typename Self::PacketReturnType*) {
285  eigen_assert(false && "should never be called");
286  }
287 };
288 
289 template <int DimIndex, typename Self, typename Op>
290 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
291  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
292  Op& reducer, typename Self::PacketReturnType* accum) {
293  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
294  for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
295  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
296  InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
297  }
298  }
299 };
300 
301 template <typename Self, typename Op>
302 struct InnerMostDimPreserver<0, Self, Op, true> {
303  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
304  Op& reducer0, typename Self::PacketReturnType* accum0) {
305  using Index = typename Self::Index;
306  const Index stride = self.m_reducedStrides[0];
307  const Index size = self.m_reducedDims[0];
308  if (!Self::ReducerTraits::IsStateful && size >= 16) {
309  const Index unrolled_size4 = (size / 4) * 4;
310  typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
311  typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
312  typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
313  for (Index j = 0; j < unrolled_size4; j += 4) {
314  const Index input0 = firstIndex + j * stride;
315  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
316  const Index input1 = firstIndex + (j + 1) * stride;
317  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
318  const Index input2 = firstIndex + (j + 2) * stride;
319  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
320  const Index input3 = firstIndex + (j + 3) * stride;
321  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
322  }
323  reducer0.reducePacket(accum1, accum0);
324  reducer0.reducePacket(accum2, accum0);
325  reducer0.reducePacket(accum3, accum0);
326  for (Index j = unrolled_size4; j < size; ++j) {
327  Index input = firstIndex + j * stride;
328  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
329  }
330  } else {
331  for (Index j = 0; j < size; ++j) {
332  Index input = firstIndex + j * stride;
333  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
334  }
335  }
336  }
337 };
338 template <typename Self, typename Op>
339 struct InnerMostDimPreserver<-1, Self, Op, true> {
340  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
341  typename Self::PacketReturnType*) {
342  eigen_assert(false && "should never be called");
343  }
344 };
345 
346 // Default full reducer
347 template <typename Self, typename Op, typename Device,
348  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
349 struct FullReducer {
350  static constexpr bool HasOptimizedImplementation = false;
351 
352  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
353  typename Self::EvaluatorPointerType output) {
354  const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
355  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
356  }
357 };
358 
359 #ifdef EIGEN_USE_THREADS
360 // Multithreaded full reducer
361 template <typename Self, typename Op, bool Vectorizable>
362 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
363  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
364  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;
365 
366  // launch one reducer per thread and accumulate the result.
367  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
368  typename Self::CoeffReturnType* output) {
369  typedef typename Self::Index Index;
370  const Index num_coeffs = array_prod(self.m_impl.dimensions());
371  if (num_coeffs == 0) {
372  *output = reducer.finalize(reducer.initialize());
373  return;
374  }
375  const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
376  TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
377  const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
378  if (num_threads == 1) {
379  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
380  return;
381  }
382  const Index blocksize = num_coeffs / num_threads;
383  const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
384  eigen_assert(num_coeffs >= numblocks * blocksize);
385 
386  Barrier barrier(internal::convert_index<unsigned int>(numblocks));
387  MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
388  for (Index i = 0; i < numblocks; ++i) {
389  auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer](){
390  shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
391  barrier.Notify();
392  };
393  device.enqueue(std::move(run_shard));
394  }
395  typename Self::CoeffReturnType finalShard;
396  if (numblocks * blocksize < num_coeffs) {
397  finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
398  num_coeffs - numblocks * blocksize, reducer);
399  } else {
400  finalShard = reducer.initialize();
401  }
402  barrier.Wait();
403 
404  for (Index i = 0; i < numblocks; ++i) {
405  reducer.reduce(shards[i], &finalShard);
406  }
407  *output = reducer.finalize(finalShard);
408  }
409 };
410 
411 #endif
412 
413 // Default inner reducer
414 template <typename Self, typename Op, typename Device>
415 struct InnerReducer {
416  static constexpr bool HasOptimizedImplementation = false;
417 
418  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
419  typename Self::Index, typename Self::Index) {
420  eigen_assert(false && "Not implemented");
421  return true;
422  }
423 };
424 
425 // Default outer reducer
426 template <typename Self, typename Op, typename Device>
427 struct OuterReducer {
428  static constexpr bool HasOptimizedImplementation = false;
429 
430  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
431  typename Self::Index, typename Self::Index) {
432  eigen_assert(false && "Not implemented");
433  return true;
434  }
435 };
436 
437 #ifdef EIGEN_USE_SYCL
438 // Default Generic reducer
439 template <typename Self, typename Op, typename Device>
440 struct GenericReducer {
441  static constexpr bool HasOptimizedImplementation = false;
442 
443  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
444  typename Self::Index, typename Self::Index) {
445  eigen_assert(false && "Not implemented");
446  return true;
447  }
448 };
449 #endif
450 
451 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
452 template <int B, int N, typename S, typename R, typename I_>
453 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
454  unsigned int*);
455 
456 #if defined(EIGEN_HAS_GPU_FP16)
457 template <typename S, typename R, typename I_>
458 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
459  R, const S, I_, internal::packet_traits<half>::type*);
460 template <int B, int N, typename S, typename R, typename I_>
461 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
462  internal::packet_traits<half>::type*);
463 template <int NPT, typename S, typename R, typename I_>
464 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
465 
466 #endif
467 
468 template <int NPT, typename S, typename R, typename I_>
469 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
470 
471 template <int NPT, typename S, typename R, typename I_>
472 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
473 #endif
474 
483 template <typename Op, typename CoeffReturnType>
485 #if defined(EIGEN_USE_SYCL)
486  typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
487 #else
488  typedef std::remove_const_t<CoeffReturnType> type;
489 #endif
490 };
491 
492 } // end namespace internal
493 
500 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
501 class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
502  public:
503  typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
504  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
505  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
506  typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
507  typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
508  typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
509 
510  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
511  : m_expr(expr), m_dims(dims) {}
512  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
513  : m_expr(expr), m_dims(dims), m_reducer(reducer) {}
514 
515  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
516  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
517  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }
518 
519  protected:
520  typename XprType::Nested m_expr;
521  const Dims m_dims;
522  const Op m_reducer;
523 };
524 
525 template <typename ArgType, typename Device>
526 struct TensorReductionEvaluatorBase;
527 
528 // Eval as rvalue
529 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
530 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
531  typedef internal::reducer_traits<Op, Device> ReducerTraits;
532  typedef Dims ReducedDims;
533  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
534  typedef typename XprType::Index Index;
535  typedef ArgType ChildType;
536  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
537  static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
538  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
539  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
540  typedef std::conditional_t<NumOutputDims == 0, Sizes<>, DSizes<Index, NumOutputDims> > Dimensions;
541  typedef typename XprType::Scalar Scalar;
542  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
543  static constexpr bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
544  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
545  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
546  static constexpr Index PacketSize = PacketType<CoeffReturnType, Device>::size;
547 
548  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
549  typedef StorageMemory<CoeffReturnType, Device> Storage;
550  typedef typename Storage::Type EvaluatorPointerType;
551 
552  // Subset of strides of the input tensor for the non-reduced dimensions.
553  // Indexed by output dimensions.
554  static constexpr int NumPreservedStrides = max_n_1<NumOutputDims>::size;
555 
556  // For full reductions
557 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
558  static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
559  static constexpr bool RunningOnSycl = false;
560 #elif defined(EIGEN_USE_SYCL)
561  static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
562  static constexpr bool RunningOnGPU = false;
563 #else
564  static constexpr bool RunningOnGPU = false;
565  static constexpr bool RunningOnSycl = false;
566 #endif
567 
568  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
569  enum {
570  IsAligned = false,
571  PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
572  BlockAccess = false,
573  PreferBlockAccess = true,
574  CoordAccess = false, // to be implemented
575  RawAccess = false
576  };
577 
578  typedef std::remove_const_t<Scalar> ScalarNoConst;
579 
580  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
581  typedef internal::TensorBlockNotImplemented TensorBlock;
582  //===--------------------------------------------------------------------===//
583 
584  static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
585  static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
586  static constexpr bool RunningFullReduction = (NumOutputDims == 0);
587 
588  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
589  : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
590  EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
591  EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
592  YOU_MADE_A_PROGRAMMING_MISTAKE);
593 
594  // Build the bitmap indicating if an input dimension is reduced or not.
595  for (int i = 0; i < NumInputDims; ++i) {
596  m_reduced[i] = false;
597  }
598  for (int i = 0; i < NumReducedDims; ++i) {
599  eigen_assert(op.dims()[i] >= 0);
600  eigen_assert(op.dims()[i] < NumInputDims);
601  m_reduced[op.dims()[i]] = true;
602  }
603 
604  const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
605  internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
606 
607  // Precompute output strides.
608  if (NumOutputDims > 0) {
609  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
610  m_outputStrides[0] = 1;
611  for (int i = 1; i < NumOutputDims; ++i) {
612  m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
613  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
614  }
615  } else {
616  m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
617  for (int i = NumOutputDims - 2; i >= 0; --i) {
618  m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
619  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
620  }
621  }
622  }
623 
624  // Precompute input strides.
625  if (NumInputDims > 0) {
626  array<Index, NumInputDims> input_strides;
627  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
628  input_strides[0] = 1;
629  for (int i = 1; i < NumInputDims; ++i) {
630  input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
631  }
632  } else {
633  input_strides.back() = 1;
634  for (int i = NumInputDims - 2; i >= 0; --i) {
635  input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
636  }
637  }
638 
639  int outputIndex = 0;
640  int reduceIndex = 0;
641  for (int i = 0; i < NumInputDims; ++i) {
642  if (m_reduced[i]) {
643  m_reducedStrides[reduceIndex] = input_strides[i];
644  ++reduceIndex;
645  } else {
646  m_preservedStrides[outputIndex] = input_strides[i];
647  m_output_to_input_dim_map[outputIndex] = i;
648  ++outputIndex;
649  }
650  }
651  }
652 
653  // Special case for full reductions
654  if (NumOutputDims == 0) {
655  m_preservedStrides[0] = internal::array_prod(input_dims);
656  }
657 
658  m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
659  : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
660  ? m_preservedStrides[0]
661  : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
662  }
663 
664  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
665 
666  EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
667  // Use the FullReducer if possible.
668  if ((RunningFullReduction && RunningOnSycl) ||
669  (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
670  ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
671  bool need_assign = false;
672  if (!data) {
673  m_result = static_cast<EvaluatorPointerType>(
674  m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
675  data = m_result;
676  need_assign = true;
677  }
678  Op reducer(m_reducer);
679  internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
680  return need_assign;
681  }
682 
683  // Attempt to use an optimized reduction.
684  else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
685  bool reducing_inner_dims = true;
686  for (int i = 0; i < NumReducedDims; ++i) {
687  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
688  reducing_inner_dims &= m_reduced[i];
689  } else {
690  reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
691  }
692  }
693  if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
694  (reducing_inner_dims || ReducingInnerMostDims)) {
695  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
696  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
697  if (!data) {
698  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
699  num_values_to_reduce > 128) ||
700  (RunningOnSycl)) {
701  data = static_cast<EvaluatorPointerType>(m_device.get(
702  (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
703  m_result = data;
704  } else {
705  return true;
706  }
707  }
708  Op reducer(m_reducer);
709  // For SYCL this if always return false
710  if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
711  num_coeffs_to_preserve)) {
712  if (m_result) {
713  m_device.deallocate_temp(m_result);
714  m_result = NULL;
715  }
716  return true;
717  } else {
718  return (m_result != NULL);
719  }
720  }
721 
722  bool preserving_inner_dims = true;
723  for (int i = 0; i < NumReducedDims; ++i) {
724  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
725  preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
726  } else {
727  preserving_inner_dims &= m_reduced[i];
728  }
729  }
730  if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
731  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
732  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
733  if (!data) {
734  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
735  num_values_to_reduce > 32) ||
736  (RunningOnSycl)) {
737  data = static_cast<EvaluatorPointerType>(m_device.get(
738  (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
739  m_result = data;
740  } else {
741  return true;
742  }
743  }
744  Op reducer(m_reducer);
745  // For SYCL this if always return false
746  if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
747  num_coeffs_to_preserve)) {
748  if (m_result) {
749  m_device.deallocate_temp(m_result);
750  m_result = NULL;
751  }
752  return true;
753  } else {
754  return (m_result != NULL);
755  }
756  }
757 #if defined(EIGEN_USE_SYCL)
758  // If there is no Optimised version for SYCL, the reduction expression
759  // must break into two subexpression and use the SYCL generic Reducer on the device.
760  if (RunningOnSycl) {
761  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
762  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
763  if (!data) {
764  data = static_cast<EvaluatorPointerType>(
765  m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
766  m_result = data;
767  }
768  Op reducer(m_reducer);
769  internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
770  num_coeffs_to_preserve);
771  return (m_result != NULL);
772  }
773 #endif
774  }
775  return true;
776  }
777 
778 #ifdef EIGEN_USE_THREADS
779  template <typename EvalSubExprsCallback>
780  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
781  m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
782  }
783 #endif
784 
785  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
786  m_impl.evalSubExprsIfNeeded(NULL);
787  return evalSubExprsIfNeededCommon(data);
788  }
789 
790  EIGEN_STRONG_INLINE void cleanup() {
791  m_impl.cleanup();
792  if (m_result) {
793  m_device.deallocate_temp(m_result);
794  m_result = NULL;
795  }
796  }
797 
798  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
799  if ((RunningFullReduction || RunningOnGPU) && m_result) {
800  return *(m_result + index);
801  }
802  Op reducer(m_reducer);
803  if (ReducingInnerMostDims || RunningFullReduction) {
804  const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
805  ? m_preservedStrides[0]
806  : m_preservedStrides[NumPreservedStrides - 1];
807  return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
808  } else {
809  typename Self::CoeffReturnType accum = reducer.initialize();
810  internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
811  return reducer.finalize(accum);
812  }
813  }
814 
815  // TODO(bsteiner): provide a more efficient implementation.
816  template <int LoadMode>
817  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
818  eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
819 
820  if (RunningOnGPU && m_result) {
821  return internal::pload<PacketReturnType>(m_result + index);
822  }
823 
824  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
825  if (ReducingInnerMostDims) {
826  const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
827  ? m_preservedStrides[0]
828  : m_preservedStrides[NumPreservedStrides - 1];
829  const Index firstIndex = firstInput(index);
830  for (Index i = 0; i < PacketSize; ++i) {
831  Op reducer(m_reducer);
832  values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
833  num_values_to_reduce, reducer);
834  }
835  } else if (PreservingInnerMostDims) {
836  const Index firstIndex = firstInput(index);
837  const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
838  // TBD: extend this the the n innermost dimensions that we preserve.
839  if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
840  Op reducer(m_reducer);
841  typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
842  internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
843  return reducer.finalizePacket(accum);
844  } else {
845  for (int i = 0; i < PacketSize; ++i) {
846  values[i] = coeff(index + i);
847  }
848  }
849  } else {
850  for (int i = 0; i < PacketSize; ++i) {
851  values[i] = coeff(index + i);
852  }
853  }
854  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
855  return rslt;
856  }
857 
858  // Must be called after evalSubExprsIfNeeded().
859  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
860  if (RunningFullReduction && m_result) {
861  return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
862  } else {
863  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
864  const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
865  return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
866  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
867  }
868  }
869 
870  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
871  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
872  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
873 
874  private:
875  template <int, typename, typename>
876  friend struct internal::GenericDimReducer;
877  template <typename, typename, bool, bool>
878  friend struct internal::InnerMostDimReducer;
879  template <int, typename, typename, bool>
880  friend struct internal::InnerMostDimPreserver;
881  template <typename S, typename O, typename D, bool V>
882  friend struct internal::FullReducer;
883 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
884  template <int B, int N, typename S, typename R, typename I_>
885  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
886 #if defined(EIGEN_HAS_GPU_FP16)
887  template <typename S, typename R, typename I_>
888  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
889  internal::packet_traits<Eigen::half>::type*);
890  template <int B, int N, typename S, typename R, typename I_>
891  KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
892  internal::packet_traits<Eigen::half>::type*);
893  template <int NPT, typename S, typename R, typename I_>
894  KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
895 #endif
896  template <int NPT, typename S, typename R, typename I_>
897  KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
898 
899  template <int NPT, typename S, typename R, typename I_>
900  KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
901 #endif
902 
903 #if defined(EIGEN_USE_SYCL)
904  template <typename Evaluator_, typename Op__>
905  friend class TensorSycl::internal::GenericNondeterministicReducer;
906  // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
907  template <typename, typename, typename>
908  friend struct internal::GenericReducer;
909 #endif
910 
911  template <typename S, typename O, typename D>
912  friend struct internal::InnerReducer;
913 
914  struct BlockIteratorState {
915  Index input_dim;
916  Index output_size;
917  Index output_count;
918  };
919 
920  // Returns the Index in the input tensor of the first value that needs to be
921  // used to compute the reduction at output index "index".
922  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
923  if (ReducingInnerMostDims) {
924  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
925  return index * m_preservedStrides[0];
926  } else {
927  return index * m_preservedStrides[NumPreservedStrides - 1];
928  }
929  }
930  // TBD: optimize the case where we preserve the innermost dimensions.
931  Index startInput = 0;
932  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
933  for (int i = NumOutputDims - 1; i > 0; --i) {
934  // This is index_i in the output tensor.
935  const Index idx = index / m_outputStrides[i];
936  startInput += idx * m_preservedStrides[i];
937  index -= idx * m_outputStrides[i];
938  }
939  if (PreservingInnerMostDims) {
940  eigen_assert(m_preservedStrides[0] == 1);
941  startInput += index;
942  } else {
943  startInput += index * m_preservedStrides[0];
944  }
945  } else {
946  for (int i = 0; i < NumOutputDims - 1; ++i) {
947  // This is index_i in the output tensor.
948  const Index idx = index / m_outputStrides[i];
949  startInput += idx * m_preservedStrides[i];
950  index -= idx * m_outputStrides[i];
951  }
952  if (PreservingInnerMostDims) {
953  eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
954  startInput += index;
955  } else {
956  startInput += index * m_preservedStrides[NumPreservedStrides - 1];
957  }
958  }
959  return startInput;
960  }
961 
962  // Bitmap indicating if an input dimension is reduced or not.
963  array<bool, NumInputDims> m_reduced;
964  // Dimensions of the output of the operation.
965  Dimensions m_dimensions;
966  // Precomputed strides for the output tensor.
967  // Avoid zero-sized arrays, since element access fails to compile on GPU.
968  array<Index, (std::max)(NumOutputDims, 1)> m_outputStrides;
969  array<internal::TensorIntDivisor<Index>, (std::max)(NumOutputDims, 1)> m_fastOutputStrides;
970  array<Index, (std::max)(NumPreservedStrides, 1)> m_preservedStrides;
971  // Map from output to input dimension index.
972  array<Index, (std::max)(NumOutputDims, 1)> m_output_to_input_dim_map;
973  // How many values go into each reduction
974  Index m_numValuesToReduce;
975 
976  // Subset of strides of the input tensor for the reduced dimensions.
977  // Indexed by reduced dimensions.
978  array<Index, NumReducedDims> m_reducedStrides;
979  // Size of the input dimensions that are reduced.
980  // Indexed by reduced dimensions.
981  array<Index, NumReducedDims> m_reducedDims;
982 
983  // Evaluator for the input expression.
984  TensorEvaluator<ArgType, Device> m_impl;
985 
986  // Operation to apply for computing the reduction.
987  Op m_reducer;
988 
989  EvaluatorPointerType m_result;
990 
991  const Device EIGEN_DEVICE_REF m_device;
992 };
993 
994 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
995 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
996  : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
997  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
998  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
999 };
1000 
1001 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
1002 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1003  : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
1004  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1005  Base;
1006  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
1007  : Base(op, device) {}
1008  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL
1009  // kernel
1010  // Therefore the coeff function should be overridden by for SYCL kernel
1011  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
1012  return *(this->data() + index);
1013  }
1014  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL
1015  // kernel
1016  // Therefore the packet function should be overridden by for SYCL kernel
1017  template <int LoadMode>
1018  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
1019  return internal::pload<typename Base::PacketReturnType>(this->data() + index);
1020  }
1021 };
1022 
1023 } // end namespace Eigen
1024 
1025 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
Namespace containing all symbols from the Eigen library.
Definition: TensorReduction.h:484
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
Tensor reduction class.
Definition: TensorForwardDeclarations.h:81
The tensor base class.
Definition: TensorForwardDeclarations.h:68