$darkmode
Eigen-unsupported  5.0.1-dev
TensorConvolutionSycl.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Mehdi Goli Codeplay Software Ltd.
5 // Ralph Potter Codeplay Software Ltd.
6 // Luke Iwanski Codeplay Software Ltd.
7 // Contact: <eigen@codeplay.com>
8 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9 
10 //
11 // This Source Code Form is subject to the terms of the Mozilla
12 // Public License v. 2.0. If a copy of the MPL was not distributed
13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14 
15 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
16 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
17 
18 // IWYU pragma: private
19 #include "./InternalHeaderCheck.h"
20 
21 namespace Eigen {
22 
23 enum class convolution_type { CONV1D, CONV2D, CONV3D };
24 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
25  typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
26 struct EigenConvolutionKernel;
27 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
28  typename Kernel_accessor, typename Buffer_accessor>
29 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
30  Buffer_accessor, convolution_type::CONV1D> {
31  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
32  Local_accessor;
33  Local_accessor local_acc;
34  Evaluator device_evaluator;
35  Kernel_accessor kernel_filter;
36  Buffer_accessor buffer_acc;
37  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
38  const size_t kernelSize;
39  const cl::sycl::range<2> input_range;
40  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
41  Buffer_accessor buffer_acc_,
42  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
43  const size_t kernelSize_, const cl::sycl::range<2> input_range_)
44  : local_acc(local_acc_),
45  device_evaluator(device_evaluator_),
46  kernel_filter(kernel_filter_),
47  buffer_acc(buffer_acc_),
48  indexMapper(indexMapper_),
49  kernelSize(kernelSize_),
50  input_range(input_range_) {}
51 
52  template <typename BooleanDim2>
53  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
54  return (boolean_check[0] && boolean_check[1]);
55  }
56  void operator()(cl::sycl::nd_item<2> itemID) const {
57  auto buffer_ptr = buffer_acc;
58  auto kernel_ptr = kernel_filter;
59  // the required row to be calculated for the for each plane in shered memory
60  const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
61  const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
62  const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
63  const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
65  for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
66  const size_t local_index = i + plane_kernel_offset;
67  const size_t tensor_index =
68  plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
69 
70  local_acc[local_index] =
71  (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
72  ? device_evaluator.coeff(tensor_index)
73  : CoeffReturnType(0);
74  }
75 
76  itemID.barrier(cl::sycl::access::fence_space::local_space);
77 
78  // calculate the convolution // output start x
79  const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
80  if (boundary_check(itemID.get_global_id() < input_range)) {
81  CoeffReturnType result = static_cast<CoeffReturnType>(0);
82  const size_t index = plane_kernel_offset + itemID.get_local_id(0);
83  for (size_t k = 0; k < kernelSize; ++k) {
84  result += (local_acc[k + index] * kernel_ptr[k]);
85  }
86  const size_t tensor_index =
87  indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
88  indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
89  buffer_ptr[tensor_index] = result;
90  }
91  }
92 };
93 
94 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
95  typename Kernel_accessor, typename Buffer_accessor>
96 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
97  Buffer_accessor, convolution_type::CONV2D> {
98  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
99  Local_accessor;
100  Local_accessor local_acc;
101  Evaluator device_evaluator;
102  Kernel_accessor kernel_filter;
103  Buffer_accessor buffer_acc;
104  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
105  const cl::sycl::range<2> kernel_size;
106  const cl::sycl::range<3> input_range;
107  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
108  Buffer_accessor buffer_acc_,
109  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
110  const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
111  : local_acc(local_acc_),
112  device_evaluator(device_evaluator_),
113  kernel_filter(kernel_filter_),
114  buffer_acc(buffer_acc_),
115  indexMapper(indexMapper_),
116  kernel_size(kernel_size_),
117  input_range(input_range_) {}
118  template <typename BooleanDim3>
119  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
120  return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
121  }
122 
123  void operator()(cl::sycl::nd_item<3> itemID) const {
124  auto buffer_ptr = buffer_acc;
125  auto kernel_ptr = kernel_filter;
126  // the required row to be calculated for the for each plane in shered memory
127  const auto num_input = cl::sycl::range<2>{
128  (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
129 
130  const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
131  const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
132 
133  const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
134  itemID.get_group(1) * itemID.get_local_range()[1]};
135 
136  // fill the local memory
137  bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
138  for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
139  const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
140  bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
141  for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
142  const size_t local_index = i + local_input_offset;
143  const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
144  i + input_offset[0], j + input_offset[1]);
145  local_acc[local_index] =
146  (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
147  ? device_evaluator.coeff(tensor_index)
148  : CoeffReturnType(0);
149  }
150  }
151 
152  itemID.barrier(cl::sycl::access::fence_space::local_space);
153 
154  // output offset start for each thread
155  const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
156  itemID.get_group(1) * itemID.get_local_range()[1]};
157 
158  if (boundary_check(itemID.get_global_id() < input_range)) {
159  CoeffReturnType result = static_cast<CoeffReturnType>(0);
160 
161  for (size_t j = 0; j < kernel_size[1]; j++) {
162  size_t kernel_offset = kernel_size[0] * j;
163  const size_t index =
164  (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
165  for (size_t i = 0; i < kernel_size[0]; i++) {
166  result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
167  }
168  }
169  const size_t tensor_index =
170  indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
171  indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
172  itemID.get_local_id(1) + output_offset[1]);
173 
174  buffer_ptr[tensor_index] = result;
175  }
176  }
177 };
178 
179 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
180  typename Kernel_accessor, typename Buffer_accessor>
181 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
182  Buffer_accessor, convolution_type::CONV3D> {
183  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
184  Local_accessor;
185  Local_accessor local_acc;
186  Evaluator device_evaluator;
187  Kernel_accessor kernel_filter;
188  Buffer_accessor buffer_acc;
189  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
190  const cl::sycl::range<3> kernel_size;
191  const cl::sycl::range<3> input_range;
192  const size_t numP;
193 
194  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
195  Buffer_accessor buffer_acc_,
196  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
197  const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
198  const size_t numP_)
199  : local_acc(local_acc_),
200  device_evaluator(device_evaluator_),
201  kernel_filter(kernel_filter_),
202  buffer_acc(buffer_acc_),
203  indexMapper(indexMapper_),
204  kernel_size(kernel_size_),
205  input_range(input_range_),
206  numP(numP_) {}
207  template <typename BooleanDim3>
208  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
209  return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
210  }
211  void operator()(cl::sycl::nd_item<3> itemID) const {
212  auto buffer_ptr = buffer_acc;
213  auto kernel_ptr = kernel_filter;
214  const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
215 
216  const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
217 
218  const auto output_offset =
219  cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
220 
221  for (size_t p = 0; p < numP; p++) {
223  const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
224  for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
225  size_t local_index_dim2 = num_input[0] * num_input[1] * k;
226  bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
227  for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
228  bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
229  size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
230  for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
231  bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
232  const size_t local_index = local_index_dim1 + i;
233  const size_t tensor_index =
234  plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
235  i + input_offset[0], j + input_offset[1], k + input_offset[2]);
236  local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
237  }
238  }
239  }
240  itemID.barrier(cl::sycl::access::fence_space::local_space);
241 
242  // calculate the convolution
243 
244  if (boundary_check(itemID.get_global_id() < input_range)) {
245  CoeffReturnType result = static_cast<CoeffReturnType>(0);
246  for (size_t k = 0; k < kernel_size[2]; k++) {
247  for (size_t j = 0; j < kernel_size[1]; j++) {
248  for (size_t i = 0; i < kernel_size[0]; i++) {
249  const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
250  const size_t local_index =
251  ((i + itemID.get_local_id(0)) +
252  num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
253 
254  result += (local_acc[local_index] * kernel_ptr[kernel_index]);
255  }
256  }
257  }
258  const size_t tensor_index =
259  indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
260  indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
261  buffer_ptr[tensor_index] = result;
262  }
263 
264  itemID.barrier(cl::sycl::access::fence_space::local_space);
265  }
266  }
267 };
268 
269 template <typename Indices, typename InputArgType, typename KernelArgType>
270 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
271  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
272 
273  static constexpr int NumDims =
274  internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
275  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
276  typedef typename XprType::Index Index;
277  typedef DSizes<Index, NumDims> Dimensions;
278  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
279  typedef const Eigen::SyclDevice Device;
280  typedef typename XprType::CoeffReturnType CoeffReturnType;
281  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
282  typedef typename InputArgType::Scalar Scalar;
283  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
284  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
285  typedef typename Storage::Type EvaluatorPointerType;
286  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
287 
288  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
289  enum {
290  IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
291  TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
292  PacketAccess = false,
293  BlockAccess = false,
294  PreferBlockAccess = false,
295  CoordAccess = false, // to be implemented
296  RawAccess = false
297  };
298 
299  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
300  typedef internal::TensorBlockNotImplemented TensorBlock;
301  //===--------------------------------------------------------------------===//
302 
303  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
304  : m_inputImpl(op.inputExpression(), device),
305  m_kernelArg(op.kernelExpression()),
306  m_kernelImpl(op.kernelExpression(), device),
307  m_indices(op.indices()),
308  m_buf(NULL),
309  m_kernel(NULL),
310  m_local_kernel(false),
311  m_device(device) {
312  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
313  static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
314  YOU_MADE_A_PROGRAMMING_MISTAKE);
315 
316  const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
317  const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
318  m_kernelImpl.dimensions();
319 
320  m_dimensions = m_inputImpl.dimensions();
321  for (int i = 0; i < NumKernelDims; ++i) {
322  const Index index = op.indices()[i];
323  const Index input_dim = input_dims[index];
324  const Index kernel_dim = kernel_dims[i];
325  const Index result_dim = input_dim - kernel_dim + 1;
326  m_dimensions[index] = result_dim;
327  }
328  }
329 
330  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
331 
332  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
333  preloadKernel();
334  m_inputImpl.evalSubExprsIfNeeded(NULL);
335  if (data) {
336  executeEval(data);
337  return false;
338  } else {
339  m_buf = (EvaluatorPointerType)m_device.get(
340  (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
341  executeEval(m_buf);
342  return true;
343  }
344  }
345 
346  EIGEN_STRONG_INLINE void cleanup() {
347  m_inputImpl.cleanup();
348  if (m_buf) {
349  m_device.deallocate_temp(m_buf);
350  m_buf = NULL;
351  }
352  if (m_local_kernel) {
353  m_device.deallocate_temp(m_kernel);
354  m_local_kernel = false;
355  }
356  m_kernel = NULL;
357  }
359  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
361  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
362 
363  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
364  // Don't make a local copy of the kernel unless we have to (i.e. it's an
365  // expression that needs to be evaluated)
366  typename KernelStorage::Type in_place = m_kernelImpl.data();
367  if (in_place) {
368  m_kernel = in_place;
369  m_local_kernel = false;
370  } else {
371  ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
372  EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
373  typedef TensorEvalToOp<const KernelArgType> EvalTo;
374  EvalTo evalToTmp(m_device.get(local), m_kernelArg);
375  const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
376  internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
377  m_kernel = local;
378  m_local_kernel = true;
379  }
380  }
381 
382  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
383  typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
384  typedef typename InputEvaluator::Dimensions InputDims;
385  switch (NumKernelDims) {
386  case 1: {
387  const size_t numX = dimensions()[m_indices[0]];
388  const size_t numP = dimensions().TotalSize() / numX;
389  const auto input_dim = std::array<size_t, 2>{numX, numP};
390  auto global_range = cl::sycl::range<2>{1, 1};
391  auto local_range = cl::sycl::range<2>{1, 1};
392  const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
393 
394  m_device.parallel_for_setup(input_dim, global_range, local_range);
395  const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
396  gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
397  const array<Index, 1> indices{{m_indices[0]}};
398  const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
399  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
400 
401  typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
402  typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
403  ConvKernel;
404 
405  m_device
406  .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
407  m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
408  indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]))
409  .wait();
410  break;
411  }
412 
413  case 2: {
414  auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
415  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
416  auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
417  (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
418  const size_t numX = dimensions()[m_indices[kernel_index[0]]];
419  const size_t numY = dimensions()[m_indices[kernel_index[1]]];
420  const size_t numP = dimensions().TotalSize() / (numX * numY);
421  auto input_dim = std::array<size_t, 3>{numX, numY, numP};
422 
423  auto global_range = cl::sycl::range<3>{1, 1, 1};
424  auto local_range = cl::sycl::range<3>{1, 1, 1};
425 
426  m_device.parallel_for_setup(input_dim, global_range, local_range);
427 
428  const size_t local_memory_size =
429  (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
430  gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
431  const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
432  const array<Index, 2> kernel_dims{
433  {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
434  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
435  typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
436  typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
437  ConvKernel;
438  m_device
439  .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
440  m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
441  indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]})
442  .wait();
443  break;
444  }
445 
446  case 3: {
447  auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
448  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
449  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
450 
451  auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
452  (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
453  (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
454 
455  const size_t numX = dimensions()[m_indices[kernel_index[0]]];
456  const size_t numY = dimensions()[m_indices[kernel_index[1]]];
457  const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
458  auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
459  const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
460 
461  const array<Index, 3> indices{
462  {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
463  const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
464  m_kernelImpl.dimensions()[kernel_index[1]],
465  m_kernelImpl.dimensions()[kernel_index[2]]}};
466 
467  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
468 
469  auto global_range = cl::sycl::range<3>{1, 1, 1};
470  auto local_range = cl::sycl::range<3>{1, 1, 1};
471 
472  m_device.parallel_for_setup(input_dim, global_range, local_range);
473  auto local_memory_range = (local_range + kernel_size - 1);
474  const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
475 
476  gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
477  typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
478  typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
479  ConvKernel;
480  m_device
481  .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
482  m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
483  indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP)
484  .wait();
485  break;
486  }
487 
488  default: {
489  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
490  THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
491  }
492  }
493  }
494 
495  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
496  eigen_assert(m_buf != NULL);
497  eigen_assert(index < m_dimensions.TotalSize());
498  return m_buf[index];
499  }
500 
501  template <int LoadMode>
502  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
503  eigen_assert(m_buf != NULL);
504  eigen_assert(index < m_dimensions.TotalSize());
505  return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
506  }
507 
508  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
509  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
510  // model.
511  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
512  // We ignore the use of fused multiply-add.
513  const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
514  const double firstIndex_compute_cost =
515  NumDims *
516  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
517  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
518  kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
519  TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
520  }
521 
522  private:
523  // No assignment (copies are needed by the kernels)
524  TensorEvaluator &operator=(const TensorEvaluator &);
525  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
526  KernelArgType m_kernelArg;
527  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
528  Indices m_indices;
529  Dimensions m_dimensions;
530  EvaluatorPointerType m_buf;
531  typename KernelStorage::Type m_kernel;
532  bool m_local_kernel;
533  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
534 }; // namespace Eigen
535 
536 } // end namespace Eigen
537 
538 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index