$darkmode
Eigen-unsupported  5.0.1-dev
TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
21 class IndexMapper {
22  public:
23  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
24  const array<Index, NumKernelDims>& indices) {
25  array<Index, NumDims> dimensions = input_dims;
26  for (int i = 0; i < NumKernelDims; ++i) {
27  const Index index = indices[i];
28  const Index input_dim = input_dims[index];
29  const Index kernel_dim = kernel_dims[i];
30  const Index result_dim = input_dim - kernel_dim + 1;
31  dimensions[index] = result_dim;
32  }
33 
34  array<Index, NumDims> inputStrides;
35  array<Index, NumDims> outputStrides;
36  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
37  inputStrides[0] = 1;
38  outputStrides[0] = 1;
39  for (int i = 1; i < NumDims; ++i) {
40  inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
41  outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
42  }
43  } else {
44  inputStrides[NumDims - 1] = 1;
45  outputStrides[NumDims - 1] = 1;
46  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
47  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
48  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
49  }
50  }
51 
52  array<Index, NumDims> gpuInputDimensions;
53  array<Index, NumDims> gpuOutputDimensions;
54  array<Index, NumDims> tmp = dimensions;
55  array<Index, NumDims> ordering;
56  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
57  for (int i = 0; i < NumKernelDims; ++i) {
58  const Index index = i + offset;
59  ordering[index] = indices[i];
60  tmp[indices[i]] = -1;
61  gpuInputDimensions[index] = input_dims[indices[i]];
62  gpuOutputDimensions[index] = dimensions[indices[i]];
63  }
64 
65  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
66  for (int i = 0; i < NumDims; ++i) {
67  if (tmp[i] >= 0) {
68  ordering[written] = i;
69  gpuInputDimensions[written] = input_dims[i];
70  gpuOutputDimensions[written] = dimensions[i];
71  ++written;
72  }
73  }
74 
75  for (int i = 0; i < NumDims; ++i) {
76  m_inputStrides[i] = inputStrides[ordering[i]];
77  m_outputStrides[i] = outputStrides[ordering[i]];
78  }
79 
80  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
81  for (int i = 0; i < NumDims; ++i) {
82  if (i > NumKernelDims) {
83  m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
84  m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
85  } else {
86  m_gpuInputStrides[i] = 1;
87  m_gpuOutputStrides[i] = 1;
88  }
89  }
90  } else {
91  for (int i = NumDims - 1; i >= 0; --i) {
92  if (static_cast<size_t>(i + 1) < offset) {
93  m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
94  m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
95  } else {
96  m_gpuInputStrides[i] = 1;
97  m_gpuOutputStrides[i] = 1;
98  }
99  }
100  }
101  }
102 
103  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
104  Index inputIndex = 0;
105  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
106  for (int d = NumDims - 1; d > NumKernelDims; --d) {
107  const Index idx = p / m_gpuInputStrides[d];
108  inputIndex += idx * m_inputStrides[d];
109  p -= idx * m_gpuInputStrides[d];
110  }
111  if (NumKernelDims < NumDims) {
112  inputIndex += p * m_inputStrides[NumKernelDims];
113  }
114  } else {
115  std::ptrdiff_t limit = 0;
116  if (NumKernelDims < NumDims) {
117  limit = NumDims - NumKernelDims - 1;
118  }
119  for (int d = 0; d < limit; ++d) {
120  const Index idx = p / m_gpuInputStrides[d];
121  inputIndex += idx * m_inputStrides[d];
122  p -= idx * m_gpuInputStrides[d];
123  }
124  inputIndex += p * m_inputStrides[limit];
125  }
126  return inputIndex;
127  }
128 
129  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
130  Index outputIndex = 0;
131  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
132  for (int d = NumDims - 1; d > NumKernelDims; --d) {
133  const Index idx = p / m_gpuOutputStrides[d];
134  outputIndex += idx * m_outputStrides[d];
135  p -= idx * m_gpuOutputStrides[d];
136  }
137  if (NumKernelDims < NumDims) {
138  outputIndex += p * m_outputStrides[NumKernelDims];
139  }
140  } else {
141  std::ptrdiff_t limit = 0;
142  if (NumKernelDims < NumDims) {
143  limit = NumDims - NumKernelDims - 1;
144  }
145  for (int d = 0; d < limit; ++d) {
146  const Index idx = p / m_gpuOutputStrides[d];
147  outputIndex += idx * m_outputStrides[d];
148  p -= idx * m_gpuOutputStrides[d];
149  }
150  outputIndex += p * m_outputStrides[limit];
151  }
152  return outputIndex;
153  }
154 
155  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
156  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
157  return i * m_inputStrides[offset];
158  }
159 
160  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
161  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
162  return i * m_outputStrides[offset];
163  }
164 
165  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
166  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
167  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
168  }
169 
170  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
171  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
172  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
173  }
174 
175  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
176  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
177  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
178  }
179 
180  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
181  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
182  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
183  }
184 
185  private:
186  static constexpr int NumDims = internal::array_size<InputDims>::value;
187  array<Index, NumDims> m_inputStrides;
188  array<Index, NumDims> m_outputStrides;
189  array<Index, NumDims> m_gpuInputStrides;
190  array<Index, NumDims> m_gpuOutputStrides;
191 };
192 
193 template <typename Dimensions, typename InputXprType, typename KernelXprType>
194 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
195  // Type promotion to handle the case where the types of the lhs and the rhs are different.
196  typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;
197  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
198  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
199  typedef typename promote_index_type<typename traits<InputXprType>::Index, typename traits<KernelXprType>::Index>::type
200  Index;
201  typedef typename InputXprType::Nested LhsNested;
202  typedef typename KernelXprType::Nested RhsNested;
203  typedef std::remove_reference_t<LhsNested> LhsNested_;
204  typedef std::remove_reference_t<RhsNested> RhsNested_;
205  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
206  static constexpr int Layout = traits<InputXprType>::Layout;
207  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
208  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>
209  PointerType;
210 
211  enum { Flags = 0 };
212 };
213 
214 template <typename Dimensions, typename InputXprType, typename KernelXprType>
215 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
216  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
217 };
218 
219 template <typename Dimensions, typename InputXprType, typename KernelXprType>
220 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
221  typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
222  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
223 };
224 
225 } // end namespace internal
226 
230 template <typename Indices, typename InputXprType, typename KernelXprType>
232  : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
233  public:
234  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
235  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
236  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
237  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
238  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
239  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
240  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
241 
242  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
243  const Indices& dims)
244  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
245 
246  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }
247 
249  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()
250  const {
251  return m_input_xpr;
252  }
253 
254  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression()
255  const {
256  return m_kernel_xpr;
257  }
258 
259  protected:
260  typename InputXprType::Nested m_input_xpr;
261  typename KernelXprType::Nested m_kernel_xpr;
262  const Indices m_indices;
263 };
264 
265 template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
266 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
267  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
268 
269  static constexpr int NumDims =
270  internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
271  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
272  typedef typename XprType::Index Index;
273  typedef DSizes<Index, NumDims> Dimensions;
274 
275  typedef typename XprType::Scalar Scalar;
276  typedef typename XprType::CoeffReturnType CoeffReturnType;
277  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
278  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
279  typedef StorageMemory<Scalar, Device> Storage;
280  typedef typename Storage::Type EvaluatorPointerType;
281 
282  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
283  enum {
284  IsAligned =
285  int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
286  PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
287  int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
288  BlockAccess = false,
289  PreferBlockAccess = false,
290  CoordAccess = false, // to be implemented
291  RawAccess = false
292  };
293 
294  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
295  typedef internal::TensorBlockNotImplemented TensorBlock;
296  //===--------------------------------------------------------------------===//
297 
298  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
299  : m_inputImpl(op.inputExpression(), device),
300  m_kernelImpl(op.kernelExpression(), device),
301  m_kernelArg(op.kernelExpression()),
302  m_kernel(NULL),
303  m_local_kernel(false),
304  m_device(device) {
305  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
306  static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
307  YOU_MADE_A_PROGRAMMING_MISTAKE);
308 
309  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
310  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
311 
312  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
313  m_inputStride[0] = 1;
314  for (int i = 1; i < NumDims; ++i) {
315  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
316  }
317  } else {
318  m_inputStride[NumDims - 1] = 1;
319  for (int i = NumDims - 2; i >= 0; --i) {
320  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
321  }
322  }
323 
324  m_dimensions = m_inputImpl.dimensions();
325  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
326  for (int i = 0; i < NumKernelDims; ++i) {
327  const Index index = op.indices()[i];
328  const Index input_dim = input_dims[index];
329  const Index kernel_dim = kernel_dims[i];
330  const Index result_dim = input_dim - kernel_dim + 1;
331  m_dimensions[index] = result_dim;
332  if (i > 0) {
333  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
334  } else {
335  m_kernelStride[0] = 1;
336  }
337  m_indexStride[i] = m_inputStride[index];
338  }
339 
340  m_outputStride[0] = 1;
341  for (int i = 1; i < NumDims; ++i) {
342  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
343  }
344  } else {
345  for (int i = NumKernelDims - 1; i >= 0; --i) {
346  const Index index = op.indices()[i];
347  const Index input_dim = input_dims[index];
348  const Index kernel_dim = kernel_dims[i];
349  const Index result_dim = input_dim - kernel_dim + 1;
350  m_dimensions[index] = result_dim;
351  if (i < NumKernelDims - 1) {
352  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
353  } else {
354  m_kernelStride[NumKernelDims - 1] = 1;
355  }
356  m_indexStride[i] = m_inputStride[index];
357  }
358 
359  m_outputStride[NumDims - 1] = 1;
360  for (int i = NumDims - 2; i >= 0; --i) {
361  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
362  }
363  }
364  }
365 
366  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
367 
368  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
369  m_inputImpl.evalSubExprsIfNeeded(NULL);
370  preloadKernel();
371  return true;
372  }
373  EIGEN_STRONG_INLINE void cleanup() {
374  m_inputImpl.cleanup();
375  if (m_local_kernel) {
376  m_device.deallocate((void*)m_kernel);
377  m_local_kernel = false;
378  }
379  m_kernel = NULL;
380  }
381 
382  void evalTo(typename XprType::Scalar* buffer) {
383  evalSubExprsIfNeeded(NULL);
384  for (int i = 0; i < dimensions().TotalSize(); ++i) {
385  buffer[i] += coeff(i);
386  }
387  cleanup();
388  }
389 
390  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
391  CoeffReturnType result = CoeffReturnType(0);
392  convolve(firstInput(index), 0, NumKernelDims - 1, result);
393  return result;
394  }
395 
396  template <int LoadMode>
397  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
398  Index indices[2] = {index, index + PacketSize - 1};
399  Index startInputs[2] = {0, 0};
400  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
401  for (int i = NumDims - 1; i > 0; --i) {
402  const Index idx0 = indices[0] / m_outputStride[i];
403  const Index idx1 = indices[1] / m_outputStride[i];
404  startInputs[0] += idx0 * m_inputStride[i];
405  startInputs[1] += idx1 * m_inputStride[i];
406  indices[0] -= idx0 * m_outputStride[i];
407  indices[1] -= idx1 * m_outputStride[i];
408  }
409  } else {
410  for (int i = 0; i < NumDims - 1; ++i) {
411  const Index idx0 = indices[0] / m_outputStride[i];
412  const Index idx1 = indices[1] / m_outputStride[i];
413  startInputs[0] += idx0 * m_inputStride[i];
414  startInputs[1] += idx1 * m_inputStride[i];
415  indices[0] -= idx0 * m_outputStride[i];
416  indices[1] -= idx1 * m_outputStride[i];
417  }
418  }
419  startInputs[0] += indices[0];
420  startInputs[1] += indices[1];
421 
422  if (startInputs[1] - startInputs[0] == PacketSize - 1) {
423  PacketReturnType result = internal::pset1<PacketReturnType>(0);
424  convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
425  return result;
426  } else {
427  EIGEN_ALIGN_MAX Scalar data[PacketSize];
428  data[0] = Scalar(0);
429  convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
430  for (int i = 1; i < PacketSize - 1; ++i) {
431  data[i] = Scalar(0);
432  convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
433  }
434  data[PacketSize - 1] = Scalar(0);
435  convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
436  return internal::pload<PacketReturnType>(data);
437  }
438  }
439 
440  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
441  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
442  // We ignore the use of fused multiply-add.
443  const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
444  const double firstIndex_compute_cost =
445  NumDims *
446  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
447  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
448  kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
449  TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
450  }
451 
452  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
453 
454  private:
455  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
456  Index startInput = 0;
457  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
458  for (int i = NumDims - 1; i > 0; --i) {
459  const Index idx = index / m_outputStride[i];
460  startInput += idx * m_inputStride[i];
461  index -= idx * m_outputStride[i];
462  }
463  } else {
464  for (int i = 0; i < NumDims - 1; ++i) {
465  const Index idx = index / m_outputStride[i];
466  startInput += idx * m_inputStride[i];
467  index -= idx * m_outputStride[i];
468  }
469  }
470  startInput += index;
471  return startInput;
472  }
473 
474  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
475  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
476  const Index input = firstIndex + j * m_indexStride[DimIndex];
477  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
478  if (DimIndex > 0) {
479  convolve(input, kernel, DimIndex - 1, accum);
480  } else {
481  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
482  }
483  }
484  }
485 
486  template <typename Packet>
487  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
488  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
489  const Index input = firstIndex + j * m_indexStride[DimIndex];
490  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
491  if (DimIndex > 0) {
492  convolvePacket(input, kernel, DimIndex - 1, accum);
493  } else {
494  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
495  internal::pset1<Packet>(m_kernel[kernel]), accum);
496  }
497  }
498  }
499 
500  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
501  // Don't make a local copy of the kernel unless we have to (i.e. it's an
502  // expression that needs to be evaluated)
503  const Scalar* in_place = m_kernelImpl.data();
504  if (in_place) {
505  m_kernel = in_place;
506  m_local_kernel = false;
507  } else {
508  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
509  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
510  typedef TensorEvalToOp<const KernelArgType> EvalTo;
511  EvalTo evalToTmp(local, m_kernelArg);
512  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
513  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
514 
515  m_kernel = local;
516  m_local_kernel = true;
517  }
518  }
519 
520  array<Index, NumDims> m_inputStride;
521  array<Index, NumDims> m_outputStride;
522 
523  array<Index, NumKernelDims> m_indexStride;
524  array<Index, NumKernelDims> m_kernelStride;
525  TensorEvaluator<InputArgType, Device> m_inputImpl;
526  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
527  Dimensions m_dimensions;
528 
529  KernelArgType m_kernelArg;
530  const Scalar* m_kernel;
531  bool m_local_kernel;
532  const Device EIGEN_DEVICE_REF m_device;
533 };
534 
535 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
536 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
537 
538 template <int StaticKernelSize>
539 struct GetKernelSize {
540  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
541 };
542 template <>
543 struct GetKernelSize<Dynamic> {
544  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
545 };
546 
547 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
548 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
549  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
550  const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
551  float* buffer) {
552 #if defined(EIGEN_HIPCC)
553  HIP_DYNAMIC_SHARED(float, s)
554 #else
555  extern __shared__ float s[];
556 #endif
557 
558  const int first_x = blockIdx.x * maxX;
559  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
560  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
561  const int num_x_output = last_x - first_x + 1;
562 
563  const int first_plane = blockIdx.y * blockDim.y;
564  const int plane_stride = blockDim.y * gridDim.y;
565 
566  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
567  // Load inputs to shared memory
568  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
569  const int plane_kernel_offset = threadIdx.y * num_x_input;
570 #pragma unroll
571  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
572  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
573  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
574  }
575 
576  __syncthreads();
577 
578  // Compute the convolution
579  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
580 
581 #pragma unroll
582  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
583  const int kernel_offset = plane_kernel_offset + i;
584  float result = 0.0f;
585 #pragma unroll
586  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
587  result += s[k + kernel_offset] * kernel[k];
588  }
589  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
590  buffer[tensor_index] = result;
591  }
592  __syncthreads();
593  }
594 };
595 
596 template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
597 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
598  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
599  const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,
600  const int kernelSizeX, const int kernelSizeY, float* buffer) {
601 #if defined(EIGEN_HIPCC)
602  HIP_DYNAMIC_SHARED(float, s)
603 #else
604  extern __shared__ float s[];
605 #endif
606 
607  const int first_x = blockIdx.x * maxX;
608  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
609  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
610  const int num_x_output = last_x - first_x + 1;
611 
612  const int first_y = blockIdx.y * maxY;
613  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
614  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
615  const int num_y_output = last_y - first_y + 1;
616 
617  const int first_plane = blockIdx.z * blockDim.z;
618  const int plane_stride = blockDim.z * gridDim.z;
619 
620  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
621  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
622  const int plane_kernel_offset = threadIdx.z * num_y_input;
623 
624 // Load inputs to shared memory
625 #pragma unroll
626  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
627  const int input_offset = num_x_input * (j + plane_kernel_offset);
628 #pragma unroll
629  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
630  const int tensor_index =
631  plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
632  s[i + input_offset] = eval.coeff(tensor_index);
633  }
634  }
635 
636  __syncthreads();
637 
638  // Convolution
639  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
640 
641 #pragma unroll
642  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
643 #pragma unroll
644  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
645  float result = 0.0f;
646 #pragma unroll
647  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
648  const int kernel_offset = kernelSizeX * l;
649  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
650 #pragma unroll
651  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
652  result += s[k + input_offset] * kernel[k + kernel_offset];
653  }
654  }
655  const int tensor_index =
656  plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
657  buffer[tensor_index] = result;
658  }
659  }
660 
661  __syncthreads();
662  }
663 };
664 
665 template <typename InputEvaluator, typename Index, typename InputDims>
666 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
667  InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
668  const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
669  const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
670  const size_t kernelSizeZ, float* buffer) {
671 #if defined(EIGEN_HIPCC)
672  HIP_DYNAMIC_SHARED(float, s)
673 #else
674  extern __shared__ float s[];
675 #endif
676 
677  // Load inputs to shared memory
678  const int first_x = blockIdx.x * maxX;
679  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
680  const int num_x_input = last_x - first_x + kernelSizeX;
681 
682  const int first_y = blockIdx.y * maxY;
683  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
684  const int num_y_input = last_y - first_y + kernelSizeY;
685 
686  const int first_z = blockIdx.z * maxZ;
687  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
688  const int num_z_input = last_z - first_z + kernelSizeZ;
689 
690  for (int p = 0; p < numPlanes; ++p) {
691  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
692  const int plane_kernel_offset = 0;
693 
694  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
695  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
696  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
697  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
698  i + first_x, j + first_y, k + first_z);
699  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
700  }
701  }
702  }
703 
704  __syncthreads();
705 
706  // Convolution
707  const int num_z_output = last_z - first_z + 1;
708  const int num_y_output = last_y - first_y + 1;
709  const int num_x_output = last_x - first_x + 1;
710  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
711 
712  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
713  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
714  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
715  float result = 0.0f;
716  for (int n = 0; n < kernelSizeZ; ++n) {
717  for (int m = 0; m < kernelSizeY; ++m) {
718  for (int l = 0; l < kernelSizeX; ++l) {
719  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
720  kernel[l + kernelSizeX * (m + kernelSizeY * n)];
721  }
722  }
723  }
724  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
725  i + first_x, j + first_y, k + first_z);
726  buffer[tensor_index] = result;
727  }
728  }
729  }
730  __syncthreads();
731  }
732 };
733 
734 template <typename Indices, typename InputArgType, typename KernelArgType>
735 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
736  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
737 
738  static constexpr int NumDims =
739  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
740  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
741  typedef typename XprType::Index Index;
742  typedef DSizes<Index, NumDims> Dimensions;
743  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
744 
745  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
746  enum {
747  IsAligned =
748  TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
749  PacketAccess = false,
750  BlockAccess = false,
751  PreferBlockAccess = false,
752  CoordAccess = false, // to be implemented
753  RawAccess = false
754  };
755 
756  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
757  typedef internal::TensorBlockNotImplemented TensorBlock;
758  //===--------------------------------------------------------------------===//
759 
760  TensorEvaluator(const XprType& op, const GpuDevice& device)
761  : m_inputImpl(op.inputExpression(), device),
762  m_kernelImpl(op.kernelExpression(), device),
763  m_kernelArg(op.kernelExpression()),
764  m_indices(op.indices()),
765  m_buf(NULL),
766  m_kernel(NULL),
767  m_local_kernel(false),
768  m_device(device) {
769  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
770  static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
771  YOU_MADE_A_PROGRAMMING_MISTAKE);
772 
773  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
774  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
775 
776  m_dimensions = m_inputImpl.dimensions();
777  for (int i = 0; i < NumKernelDims; ++i) {
778  const Index index = op.indices()[i];
779  const Index input_dim = input_dims[index];
780  const Index kernel_dim = kernel_dims[i];
781  const Index result_dim = input_dim - kernel_dim + 1;
782  m_dimensions[index] = result_dim;
783  }
784  }
785 
786  typedef typename XprType::CoeffReturnType CoeffReturnType;
787  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
788  typedef typename InputArgType::Scalar Scalar;
789  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
790 
791  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
792 
793  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
794  preloadKernel();
795  m_inputImpl.evalSubExprsIfNeeded(NULL);
796  if (data) {
797  executeEval(data);
798  return false;
799  } else {
800  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
801  executeEval(m_buf);
802  return true;
803  }
804  }
805 
806  EIGEN_STRONG_INLINE void cleanup() {
807  m_inputImpl.cleanup();
808  if (m_buf) {
809  m_device.deallocate(m_buf);
810  m_buf = NULL;
811  }
812  if (m_local_kernel) {
813  m_device.deallocate((void*)m_kernel);
814  m_local_kernel = false;
815  }
816  m_kernel = NULL;
817  }
818 
819  EIGEN_STRONG_INLINE void preloadKernel() {
820  // Don't make a local copy of the kernel unless we have to (i.e. it's an
821  // expression that needs to be evaluated)
822  const Scalar* in_place = m_kernelImpl.data();
823  if (in_place) {
824  m_kernel = in_place;
825  m_local_kernel = false;
826  } else {
827  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
828  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
829  typedef TensorEvalToOp<const KernelArgType> EvalTo;
830  EvalTo evalToTmp(local, m_kernelArg);
831  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
832  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
833 
834  m_kernel = local;
835  m_local_kernel = true;
836  }
837  }
838 
839  static unsigned int ceil(unsigned int num, unsigned int denom) {
840  const unsigned int rounded_toward_zero = num / denom;
841  if (num > rounded_toward_zero * denom) {
842  return rounded_toward_zero + 1;
843  }
844  return rounded_toward_zero;
845  }
846 
847  void executeEval(Scalar* data) const {
848  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
849 
850  const int maxSharedMem = m_device.sharedMemPerBlock();
851  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
852  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
853  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
854  const int warpSize = 32;
855 
856  switch (NumKernelDims) {
857  case 1: {
858  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
859 
860  const int numX = dimensions()[m_indices[0]];
861  const int numP = dimensions().TotalSize() / numX;
862  int maxX;
863  dim3 block_size;
864 
865  const int single_stride_dim =
866  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
867  if (m_indices[0] == single_stride_dim) {
868  // Maximum the reuse
869  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
870  maxX = numext::mini<int>(inner_dim, numX);
871  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
872  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
873  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
874  } else {
875  // Read as much as possible alongside the inner most dimension, that is the plane
876  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
877  const int maxP = numext::mini<int>(inner_dim, numP);
878  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
879 
880  block_size.x = numext::mini(warpSize, maxX);
881  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
882  }
883 
884  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
885  gpu_assert(shared_mem <= maxSharedMem);
886 
887  const int num_x_blocks = ceil(numX, maxX);
888  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
889  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
890 
891  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
892 
893  // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
894  // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: "
895  // << shared_mem << " in stream " << m_device.stream() << endl;
896 
897  const array<Index, 1> indices{m_indices[0]};
898  const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
899  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
900  switch (kernel_size) {
901  case 4: {
902  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
903  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
904  numX, maxX, 4, data);
905  break;
906  }
907  case 7: {
908  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
909  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
910  numX, maxX, 7, data);
911  break;
912  }
913  default: {
914  LAUNCH_GPU_KERNEL(
915  (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
916  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
917  kernel_size, data);
918  }
919  }
920  break;
921  }
922 
923  case 2: {
924  const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
925  const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
926  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
927  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
928 
929  const int numX = dimensions()[m_indices[idxX]];
930  const int numY = dimensions()[m_indices[idxY]];
931  const int numP = dimensions().TotalSize() / (numX * numY);
932 
933  const float scaling_factor =
934  sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
935 
936  // Snap maxX to warp size
937  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
938  const int maxX = numext::mini<int>(inner_dim, numX);
939  const int maxY =
940  numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
941  const int maxP = numext::mini<int>(
942  maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
943 
944  dim3 block_size;
945  block_size.x = numext::mini(1024, maxX);
946  block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
947  block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);
948 
949  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
950  gpu_assert(shared_mem <= maxSharedMem);
951 
952  const int num_x_blocks = ceil(numX, maxX);
953  const int num_y_blocks = ceil(numY, maxY);
954  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
955  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
956 
957  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
958 
959  // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
960  // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
961  // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << "
962  // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
963 
964  const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
965  const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
966  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
967  switch (kernel_size_x) {
968  case 4: {
969  switch (kernel_size_y) {
970  case 7: {
971  LAUNCH_GPU_KERNEL(
972  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
973  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
974  numY, maxY, 4, 7, data);
975  break;
976  }
977  default: {
978  LAUNCH_GPU_KERNEL(
979  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
980  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
981  numY, maxY, 4, kernel_size_y, data);
982  break;
983  }
984  }
985  break;
986  }
987  case 7: {
988  switch (kernel_size_y) {
989  case 4: {
990  LAUNCH_GPU_KERNEL(
991  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
992  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
993  numY, maxY, 7, 4, data);
994  break;
995  }
996  default: {
997  LAUNCH_GPU_KERNEL(
998  (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
999  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
1000  numY, maxY, 7, kernel_size_y, data);
1001  break;
1002  }
1003  }
1004  break;
1005  }
1006  default: {
1007  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
1008  Dynamic, Dynamic>),
1009  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
1010  numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1011  break;
1012  }
1013  }
1014  break;
1015  }
1016 
1017  case 3: {
1018  const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1019  const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1020  const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1021 
1022  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1023  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1024  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1025 
1026  const int numX = dimensions()[m_indices[idxX]];
1027  const int numY = dimensions()[m_indices[idxY]];
1028  const int numZ = dimensions()[m_indices[idxZ]];
1029  const int numP = dimensions().TotalSize() / (numX * numY * numZ);
1030 
1031  const int maxX = numext::mini<int>(
1032  128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
1033  numX));
1034  const int maxY = numext::mini<int>(
1035  128, numext::mini<int>(
1036  maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
1037  numY));
1038  const int maxZ = numext::mini<int>(
1039  128, numext::mini<int>(
1040  maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
1041  kernel_size_z + 1,
1042  numZ));
1043 
1044  dim3 block_size;
1045  block_size.x = numext::mini(32, maxX);
1046  block_size.y = numext::mini(32, maxY);
1047  block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
1048  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1049 
1050  const int shared_mem =
1051  (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1052  gpu_assert(shared_mem <= maxSharedMem);
1053 
1054  // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
1055  // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
1056  // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() <<
1057  // endl;
1058  const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
1059  const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
1060  m_kernelImpl.dimensions()[idxZ]};
1061  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
1062 
1063  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
1064  num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
1065  maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1066  break;
1067  }
1068 
1069  default: {
1070  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
1071  THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1072  }
1073  }
1074  }
1075 
1076  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
1077  eigen_assert(m_buf);
1078  eigen_assert(index < m_dimensions.TotalSize());
1079  return m_buf[index];
1080  }
1081 
1082  template <int LoadMode>
1083  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
1084  eigen_assert(m_buf);
1085  eigen_assert(index < m_dimensions.TotalSize());
1086  return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
1087  }
1088 
1089  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
1090  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1091  // model.
1092  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1093  // We ignore the use of fused multiply-add.
1094  const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1095  const double firstIndex_compute_cost =
1096  NumDims *
1097  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
1098  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1099  kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
1100  TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
1101  }
1102 
1103  private:
1104  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1105  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1106  KernelArgType m_kernelArg;
1107  Indices m_indices;
1108  Dimensions m_dimensions;
1109  Scalar* m_buf;
1110  const Scalar* m_kernel;
1111  bool m_local_kernel;
1112 
1113  const GpuDevice& m_device;
1114 };
1115 #endif
1116 
1117 } // end namespace Eigen
1118 
1119 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
const internal::remove_all_t< typename InputXprType::Nested > & inputExpression() const
Definition: TensorConvolution.h:249
Namespace containing all symbols from the Eigen library.
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_ceil_op< typename Derived::Scalar >, const Derived > ceil(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The tensor base class.
Definition: TensorForwardDeclarations.h:68
Definition: TensorConvolution.h:231
const int Dynamic