$darkmode
Eigen-unsupported  5.0.1-dev
TensorReductionSycl.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 //
9 // This Source Code Form is subject to the terms of the Mozilla
10 // Public License v. 2.0. If a copy of the MPL was not distributed
11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12 
13 /*****************************************************************
14  * TensorReductionSycl.h
15  *
16  * \brief:
17  * This is the specialization of the reduction operation. Two phase reduction approach
18  * is used since the GPU does not have Global Synchronization for global memory among
19  * different work-group/thread block. To solve the problem, we need to create two kernels
20  * to reduce the data, where the first kernel reduce the data locally and each local
21  * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
22  * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
23  * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
24  * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
25  *
26  *****************************************************************/
27 
28 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
29 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
30 // IWYU pragma: private
31 #include "./InternalHeaderCheck.h"
32 
33 namespace Eigen {
34 namespace TensorSycl {
35 namespace internal {
36 
37 template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
38 struct OpDefiner {
39  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
40  typedef Op type;
41  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
42 
43  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
44  const Index &) {
45  return accumulator;
46  }
47 };
48 
49 template <typename CoeffReturnType, typename Index>
50 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
51  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
52  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
53  return type();
54  }
55 
56  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
57  const Index &scale) {
58  ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
59  return quotient_op(accumulator, CoeffReturnType(scale));
60  }
61 };
62 
63 template <typename CoeffReturnType, typename Index>
64 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
65  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
66  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
67  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
68  return type();
69  }
70 
71  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
72  const Index &scale) {
73  return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
74  }
75 };
76 
77 template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
78  Index local_range>
79 struct SecondStepFullReducer {
80  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
81  LocalAccessor;
82  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
83  typedef typename OpDef::type Op;
84  LocalAccessor scratch;
85  InputAccessor aI;
86  OutputAccessor outAcc;
87  Op op;
88  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
89  : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
90 
91  void operator()(cl::sycl::nd_item<1> itemID) const {
92  // Our empirical research shows that the best performance will be achieved
93  // when there is only one element per thread to reduce in the second step.
94  // in this step the second step reduction time is almost negligible.
95  // Hence, in the second step of reduction the input size is fixed to the
96  // local size, thus, there is only one element read per thread. The
97  // algorithm must be changed if the number of reduce per thread in the
98  // second step is greater than 1. Otherwise, the result will be wrong.
99  const Index localid = itemID.get_local_id(0);
100  auto aInPtr = aI + localid;
101  auto aOutPtr = outAcc;
102  CoeffReturnType *scratchptr = scratch.get_pointer();
103  CoeffReturnType accumulator = *aInPtr;
104 
105  scratchptr[localid] = op.finalize(accumulator);
106  for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
107  itemID.barrier(cl::sycl::access::fence_space::local_space);
108  if (localid < offset) {
109  op.reduce(scratchptr[localid + offset], &accumulator);
110  scratchptr[localid] = op.finalize(accumulator);
111  }
112  }
113  if (localid == 0) *aOutPtr = op.finalize(accumulator);
114  }
115 };
116 
117 // Full reduction first phase. In this version the vectorization is true and the reduction accept
118 // any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ).
119 template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
120 class FullReductionKernelFunctor {
121  public:
122  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
123  typedef typename Evaluator::Index Index;
124  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
125  (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
126  OpDef;
127 
128  typedef typename OpDef::type Op;
129  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
130  typedef typename Evaluator::PacketReturnType PacketReturnType;
131  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
132  CoeffReturnType>
133  OutType;
134  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
135  LocalAccessor;
136  LocalAccessor scratch;
137  Evaluator evaluator;
138  EvaluatorPointerType final_output;
139  Index rng;
140  Op op;
141 
142  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
143  Index rng_, OpType op_)
144  : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
145 
146  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
147 
148  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
149  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
150  const cl::sycl::nd_item<1> &itemID) const {
151  auto output_ptr = final_output;
152  Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
153  Index globalid = itemID.get_global_id(0);
154  Index localid = itemID.get_local_id(0);
155  Index step = Evaluator::PacketSize * itemID.get_global_range(0);
156  Index start = Evaluator::PacketSize * globalid;
157  // vectorizable parts
158  PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
159  for (Index i = start; i < VectorizedRange; i += step) {
160  op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
161  }
162  globalid += VectorizedRange;
163  // non vectorizable parts
164  for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
165  op.template reducePacket<PacketReturnType>(
166  ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
167  evaluator.impl().coeff(i), op.initialize()),
168  &packetAccumulator);
169  }
170  scratch[localid] = packetAccumulator =
171  OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
172  // reduction parts // Local size is always power of 2
173  EIGEN_UNROLL_LOOP
174  for (Index offset = local_range / 2; offset > 0; offset /= 2) {
175  itemID.barrier(cl::sycl::access::fence_space::local_space);
176  if (localid < offset) {
177  op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
178  scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
179  }
180  }
181  if (localid == 0) {
182  output_ptr[itemID.get_group(0)] =
183  op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
184  }
185  }
186 
187  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
188  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
189  const cl::sycl::nd_item<1> &itemID) const {
190  auto output_ptr = final_output;
191  Index globalid = itemID.get_global_id(0);
192  Index localid = itemID.get_local_id(0);
193  // vectorizable parts
194  CoeffReturnType accumulator = op.initialize();
195  // non vectorizable parts
196  for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
197  op.reduce(evaluator.impl().coeff(i), &accumulator);
198  }
199  scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
200 
201  // reduction parts. the local size is always power of 2
202  EIGEN_UNROLL_LOOP
203  for (Index offset = local_range / 2; offset > 0; offset /= 2) {
204  itemID.barrier(cl::sycl::access::fence_space::local_space);
205  if (localid < offset) {
206  op.reduce(scratch[localid + offset], &accumulator);
207  scratch[localid] = op.finalize(accumulator);
208  }
209  }
210  if (localid == 0) {
211  output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
212  }
213  }
214 };
215 
216 template <typename Evaluator, typename OpType>
217 class GenericNondeterministicReducer {
218  public:
219  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
220  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
221  typedef typename Evaluator::Index Index;
222  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
223  typedef typename OpDef::type Op;
224  template <typename Scratch>
225  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
226  Index range_, Index num_values_to_reduce_)
227  : evaluator(evaluator_),
228  output_accessor(output_accessor_),
229  functor(OpDef::get_op(functor_)),
230  range(range_),
231  num_values_to_reduce(num_values_to_reduce_) {}
232 
233  void operator()(cl::sycl::nd_item<1> itemID) const {
234  // This is to bypass the statefull condition in Eigen meanReducer
235  Op non_const_functor;
236  std::memcpy(&non_const_functor, &functor, sizeof(Op));
237  auto output_accessor_ptr = output_accessor;
238  Index globalid = static_cast<Index>(itemID.get_global_linear_id());
239  if (globalid < range) {
240  CoeffReturnType accum = functor.initialize();
241  Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
242  evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
243  output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
244  }
245  }
246 
247  private:
248  Evaluator evaluator;
249  EvaluatorPointerType output_accessor;
250  Op functor;
251  Index range;
252  Index num_values_to_reduce;
253 };
254 
255 enum class reduction_dim { inner_most, outer_most };
256 // default is preserver
257 template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
258 struct PartialReductionKernel {
259  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
260  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
261  typedef typename Evaluator::Index Index;
262  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
263  typedef typename OpDef::type Op;
264  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
265  ScratchAcc;
266  ScratchAcc scratch;
267  Evaluator evaluator;
268  EvaluatorPointerType output_accessor;
269  Op op;
270  const Index preserve_elements_num_groups;
271  const Index reduce_elements_num_groups;
272  const Index num_coeffs_to_preserve;
273  const Index num_coeffs_to_reduce;
274 
275  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
276  const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
277  const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
278  : scratch(scratch_),
279  evaluator(evaluator_),
280  output_accessor(output_accessor_),
281  op(OpDef::get_op(op_)),
282  preserve_elements_num_groups(preserve_elements_num_groups_),
283  reduce_elements_num_groups(reduce_elements_num_groups_),
284  num_coeffs_to_preserve(num_coeffs_to_preserve_),
285  num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
286 
287  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
288  CoeffReturnType &accumulator) const {
289  if (globalPId >= num_coeffs_to_preserve) {
290  return;
291  }
292  Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
293  : globalRId + (globalPId * num_coeffs_to_reduce);
294  Index localOffset = globalRId;
295 
296  const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
297  const Index per_thread_global_stride =
298  rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
299  for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
300  op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
301  localOffset += per_thread_local_stride;
302  global_offset += per_thread_global_stride;
303  }
304  }
305  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
306  const Index linearLocalThreadId = itemID.get_local_id(0);
307  Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
308  : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
309  Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
310  : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
311  const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
312  : itemID.get_group(0) / reduce_elements_num_groups;
313  const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
314  : itemID.get_group(0) % reduce_elements_num_groups;
315 
316  Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
317  const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
318  CoeffReturnType *scratchPtr = scratch.get_pointer();
319  auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
320  CoeffReturnType accumulator = op.initialize();
321 
322  element_wise_reduce(globalRId, globalPId, accumulator);
323 
324  accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
325  scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
326  accumulator;
327  if (rt == reduction_dim::inner_most) {
328  pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
329  rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
330  globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
331  }
332 
333  /* Apply the reduction operation between the current local
334  * id and the one on the other half of the vector. */
335  auto out_scratch_ptr =
336  scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
337  itemID.barrier(cl::sycl::access::fence_space::local_space);
338  if (rt == reduction_dim::inner_most) {
339  accumulator = *out_scratch_ptr;
340  }
341  // The Local LocalThreadSizeR is always power of 2
342  EIGEN_UNROLL_LOOP
343  for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
344  if (rLocalThreadId < offset) {
345  op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
346  // The result has already been divided for mean reducer in the
347  // previous reduction so no need to divide furthermore
348  *out_scratch_ptr = op.finalize(accumulator);
349  }
350  /* All threads collectively read from global memory into local.
351  * The barrier ensures all threads' IO is resolved before
352  * execution continues (strictly speaking, all threads within
353  * a single work-group - there is no co-ordination between
354  * work-groups, only work-items). */
355  itemID.barrier(cl::sycl::access::fence_space::local_space);
356  }
357 
358  if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
359  outPtr[globalPId] = op.finalize(accumulator);
360  }
361  }
362 };
363 
364 template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
365 struct SecondStepPartialReduction {
366  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
367  typedef typename OpDef::type Op;
368  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
369  ScratchAccessor;
370  InputAccessor input_accessor;
371  OutputAccessor output_accessor;
372  Op op;
373  const Index num_coeffs_to_preserve;
374  const Index num_coeffs_to_reduce;
375 
376  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
377  OutputAccessor output_accessor_, OpType op_,
378  const Index num_coeffs_to_preserve_,
379  const Index num_coeffs_to_reduce_)
380  : input_accessor(input_accessor_),
381  output_accessor(output_accessor_),
382  op(OpDef::get_op(op_)),
383  num_coeffs_to_preserve(num_coeffs_to_preserve_),
384  num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
385 
386  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
387  const Index globalId = itemID.get_global_id(0);
388 
389  if (globalId >= num_coeffs_to_preserve) return;
390 
391  auto in_ptr = input_accessor + globalId;
392 
393  OutScalar accumulator = op.initialize();
394  // num_coeffs_to_reduce is not bigger that 256
395  for (Index i = 0; i < num_coeffs_to_reduce; i++) {
396  op.reduce(*in_ptr, &accumulator);
397  in_ptr += num_coeffs_to_preserve;
398  }
399  output_accessor[globalId] = op.finalize(accumulator);
400  }
401 }; // namespace internal
402 
403 template <typename Index, Index LTP, Index LTR, bool BC_>
404 struct ReductionPannel {
405  static constexpr Index LocalThreadSizeP = LTP;
406  static constexpr Index LocalThreadSizeR = LTR;
407  static constexpr bool BC = BC_;
408 };
409 
410 template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
411 struct PartialReducerLauncher {
412  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
413  typedef typename Self::CoeffReturnType CoeffReturnType;
414  typedef typename Self::Storage Storage;
415  typedef typename Self::Index Index;
416  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
417  PannelParameters;
418 
419  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
420 
421  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
422  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
423  Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
424 
425  // getPowerOfTwo makes sure local range is power of 2 and <=
426  // maxSyclThreadPerBlock this will help us to avoid extra check on the
427  // kernel
428  static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
429  (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
430  "The Local thread size must be a power of 2 for the reduction "
431  "operation");
432 
433  constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
434  // In this step, we force the code not to be more than 2-step reduction:
435  // Our empirical research shows that if each thread reduces at least 64
436  // elements individually, we get better performance. However, this can change
437  // on different platforms. In this step we force the code not to be
438  // morthan step reduction: Our empirical research shows that for inner_most
439  // dim reducer, it is better to have 8 group in a reduce dimension for sizes
440  // > 1024 to achieve the best performance.
441  const Index reductionPerThread = 64;
442  Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
443  const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
444  Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
445  const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
446  const Index globalRange = pNumGroups * rNumGroups * localRange;
447 
448  constexpr Index scratchSize =
449  PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
450  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
451  if (rNumGroups > 1) {
452  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
453  dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
454  EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
455  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
456  self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
457  num_coeffs_to_reduce)
458  .wait();
459  typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
460  SecondStepPartialReductionKernel;
461  dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
462  temp_accessor, output,
463  cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
464  Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
465  .wait();
466  self.device().deallocate_temp(temp_pointer);
467  } else {
468  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
469  self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
470  num_coeffs_to_reduce)
471  .wait();
472  }
473  return false;
474  }
475 };
476 } // namespace internal
477 } // namespace TensorSycl
478 
479 namespace internal {
480 
481 template <typename Self, typename Op, bool Vectorizable>
482 struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
483  typedef typename Self::CoeffReturnType CoeffReturnType;
484  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
485  static constexpr bool HasOptimizedImplementation = true;
486  static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
487  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
488  typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
489  static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
490  (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
491  "The Local thread size must be a power of 2 for the reduction "
492  "operation");
493  constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
494 
495  typename Self::Index inputSize = self.impl().dimensions().TotalSize();
496  // In this step we force the code not to be more than 2-step reduction:
497  // Our empirical research shows that if each thread reduces at least 512
498  // elements individually, we get better performance.
499  const Index reductionPerThread = 2048;
500  // const Index num_work_group =
501  Index reductionGroup = dev.getPowerOfTwo(
502  (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
503  const Index num_work_group = std::min(reductionGroup, local_range);
504  // 1
505  // ? local_range
506  // : 1);
507  const Index global_range = num_work_group * local_range;
508 
509  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
510  typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
511  if (num_work_group > 1) {
512  CoeffReturnType *temp_pointer =
513  static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
514  typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
515  dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
516  local_range, inputSize, reducer)
517  .wait();
518  typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
519  EvaluatorPointerType, Index, local_range>
520  GenericRKernel;
521  dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
522  tmp_global_accessor, data,
523  cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
524  num_work_group, reducer)
525  .wait();
526  dev.deallocate_temp(temp_pointer);
527  } else {
528  dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
529  reducer)
530  .wait();
531  }
532  }
533 };
534 // vectorizable inner_most most dim preserver
535 // col reduction
536 template <typename Self, typename Op>
537 struct OuterReducer<Self, Op, Eigen::SyclDevice> {
538  static constexpr bool HasOptimizedImplementation = true;
539 
540  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
541  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
542  typename Self::Index num_coeffs_to_preserve) {
543  return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
544  Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
545  num_coeffs_to_reduce,
546  num_coeffs_to_preserve);
547  }
548 };
549 // row reduction
550 template <typename Self, typename Op>
551 struct InnerReducer<Self, Op, Eigen::SyclDevice> {
552  static constexpr bool HasOptimizedImplementation = true;
553 
554  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
555  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
556  typename Self::Index num_coeffs_to_preserve) {
557  return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
558  Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
559  num_coeffs_to_reduce,
560  num_coeffs_to_preserve);
561  }
562 };
563 
564 // ArmgMax uses this kernel for partial reduction//
565 // TODO(@mehdi.goli) come up with a better kernel
566 // generic partial reduction
567 template <typename Self, typename Op>
568 struct GenericReducer<Self, Op, Eigen::SyclDevice> {
569  static constexpr bool HasOptimizedImplementation = false;
570  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
571  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
572  typename Self::Index num_coeffs_to_preserve) {
573  typename Self::Index range, GRange, tileSize;
574  dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
575 
576  dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
577  TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
578  self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
579  reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
580  .wait();
581  return false;
582  }
583 };
584 
585 } // namespace internal
586 } // namespace Eigen
587 
588 #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index