$darkmode
Eigen-unsupported  5.0.1-dev
TensorExecutor.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
28 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
29 
30 template <typename Expression>
32  enum { value = false };
33 };
34 
35 template <typename LhsXprType, typename RhsXprType>
36 struct ExpressionHasTensorBroadcastingOp<const TensorAssignOp<LhsXprType, RhsXprType> > {
38 };
39 
40 template <typename UnaryOp, typename XprType>
41 struct ExpressionHasTensorBroadcastingOp<const TensorCwiseUnaryOp<UnaryOp, XprType> > {
42  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
43 };
44 
45 template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
46 struct ExpressionHasTensorBroadcastingOp<const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
47  enum {
48  value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value || ExpressionHasTensorBroadcastingOp<RhsXprType>::value
49  };
50 };
51 
52 template <typename Broadcast, typename XprType>
53 struct ExpressionHasTensorBroadcastingOp<const TensorBroadcastingOp<Broadcast, XprType> > {
54  enum { value = true };
55 };
56 
57 // -------------------------------------------------------------------------- //
58 
75 template <typename Expression, typename Device, bool Vectorizable, TiledEvaluation Tiling>
77  public:
78  typedef typename Expression::Index StorageIndex;
79 
80  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
81  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
82  // violation. If this template is instantiated with a non-default device, it
83  // means that this header file was included without defining
84  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
85  static_assert(std::is_same<Device, DefaultDevice>::value,
86  "Default executor instantiated with non-default device. "
87  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
88  "EIGEN_USE_SYCL before including Eigen headers.");
89 
90  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
91  TensorEvaluator<Expression, Device> evaluator(expr, device);
92  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
93  if (needs_assign) {
94  const StorageIndex size = array_prod(evaluator.dimensions());
95  for (StorageIndex i = 0; i < size; ++i) {
96  evaluator.evalScalar(i);
97  }
98  }
99  evaluator.cleanup();
100  }
101 };
102 
107 template <typename Expression, typename Device, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
109 
113 template <typename Expression>
114 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
115  /*Tiling=*/TiledEvaluation::Off> {
116  public:
117  typedef typename Expression::Index StorageIndex;
118 
119  static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
120  TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
121  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
122  if (needs_assign) {
123  const StorageIndex size = array_prod(evaluator.dimensions());
124  const int PacketSize =
125  unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
126 
127  // Give compiler a strong possibility to unroll the loop. But don't insist
128  // on unrolling, because if the function is expensive compiler should not
129  // unroll the loop at the expense of inlining.
130  const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
131  for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
132  for (StorageIndex j = 0; j < 4; j++) {
133  evaluator.evalPacket(i + j * PacketSize);
134  }
135  }
136  const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
137  for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
138  evaluator.evalPacket(i);
139  }
140  for (StorageIndex i = VectorizedSize; i < size; ++i) {
141  evaluator.evalScalar(i);
142  }
143  }
144  evaluator.cleanup();
145  }
146 };
147 
152 template <typename Expression, bool Vectorizable>
153 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
154  /*Tiling=*/TiledEvaluation::On> {
155  public:
156  typedef typename traits<Expression>::Scalar Scalar;
157  typedef std::remove_const_t<Scalar> ScalarNoConst;
158 
160  typedef typename traits<Expression>::Index StorageIndex;
161 
162  static constexpr int NumDims = traits<Expression>::NumDimensions;
163 
164  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr,
165  const DefaultDevice& device = DefaultDevice()) {
166  typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> TensorBlockMapper;
167 
168  typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
169  typedef internal::TensorBlockScratchAllocator<DefaultDevice> TensorBlockScratch;
170 
171  Evaluator evaluator(expr, device);
172 
173  // TODO(ezhulenev): Do not use tiling for small tensors?
174  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
175 
176  if (needs_assign) {
177  // Query expression tree for desired block size/shape.
178  const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
179 
180  const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements);
181 
182  // Share scratch memory allocator between all blocks.
183  TensorBlockScratch scratch(device);
184 
185  const StorageIndex total_block_count = block_mapper.blockCount();
186  for (StorageIndex i = 0; i < total_block_count; ++i) {
187  TensorBlockDesc desc = block_mapper.blockDescriptor(i);
188  evaluator.evalBlock(desc, scratch);
189  scratch.reset();
190  }
191  }
192  evaluator.cleanup();
193  }
194 };
195 
207 #ifdef EIGEN_USE_THREADS
208 
209 template <typename TensorBlockMapper>
210 struct TensorExecutorTilingContext {
211  TensorExecutorTilingContext() = default;
212  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size)
213  : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {}
214 
215  TensorBlockMapper block_mapper; // navigate through blocks
216  TensorOpCost cost; // cost of computing a single block
217  size_t aligned_blocksize; // block size after memory alignment
218 };
219 
220 // Computes a block evaluation parameters, and allocates temporary memory buffer
221 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
222 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
223 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(const Evaluator& evaluator) {
224  // Query expression tree for desired block size/shape.
225  TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
226 
227  // Update target block size based on cost model.
228  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, requirements.cost_per_coeff);
229  requirements.size = static_cast<size_t>(1.0 / taskSize);
230 
231  TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements);
232 
233  size_t block_size = block_mapper.blockTotalSize();
234  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
235  const size_t aligned_blocksize =
236  align * numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
237 
238  return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize};
239 }
240 
241 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
242 struct EvalRange {
243  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
244  Evaluator evaluator = *evaluator_in;
245  eigen_assert(lastIdx >= firstIdx);
246  for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
247  evaluator.evalScalar(i);
248  }
249  }
250 
251  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
252 };
253 
254 template <typename Evaluator, typename StorageIndex>
255 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
256  static constexpr int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
257 
258  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
259  Evaluator evaluator = *evaluator_in;
260  eigen_assert(lastIdx >= firstIdx);
261  StorageIndex i = firstIdx;
262  if (lastIdx - firstIdx >= PacketSize) {
263  eigen_assert(firstIdx % PacketSize == 0);
264  StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
265  // Give compiler a strong possibility to unroll the loop. But don't insist
266  // on unrolling, because if the function is expensive compiler should not
267  // unroll the loop at the expense of inlining.
268  for (; i <= last_chunk_offset; i += 4 * PacketSize) {
269  for (StorageIndex j = 0; j < 4; j++) {
270  evaluator.evalPacket(i + j * PacketSize);
271  }
272  }
273  last_chunk_offset = lastIdx - PacketSize;
274  for (; i <= last_chunk_offset; i += PacketSize) {
275  evaluator.evalPacket(i);
276  }
277  }
278  for (; i < lastIdx; ++i) {
279  evaluator.evalScalar(i);
280  }
281  }
282 
283  static StorageIndex alignBlockSize(StorageIndex size) {
284  // Align block size to packet size and account for unrolling in run above.
285  if (size >= 16 * PacketSize) {
286  return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
287  }
288  // Aligning to 4 * PacketSize would increase block size by more than 25%.
289  return (size + PacketSize - 1) & ~(PacketSize - 1);
290  }
291 };
292 
293 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
294 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
295  public:
296  typedef typename Expression::Index StorageIndex;
297 
298  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
299  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
300  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
301 
302  Evaluator evaluator(expr, device);
303  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
304  if (needs_assign) {
305  const StorageIndex size = array_prod(evaluator.dimensions());
306  device.parallelFor(
307  size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
308  [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); });
309  }
310  evaluator.cleanup();
311  }
312 };
313 
314 template <typename Expression, bool Vectorizable>
315 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
316  /*Tiling=*/TiledEvaluation::On> {
317  public:
318  typedef typename traits<Expression>::Index IndexType;
319  typedef typename traits<Expression>::Scalar Scalar;
320  typedef std::remove_const_t<Scalar> ScalarNoConst;
321 
322  static constexpr int NumDims = traits<Expression>::NumDimensions;
323 
324  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
325  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
326  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
327 
328  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
329  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
330 
331  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
332  Evaluator evaluator(expr, device);
333 
334  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
335  if (needs_assign) {
336  const TilingContext tiling =
337  internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(evaluator);
338 
339  auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) {
340  TensorBlockScratch scratch(device);
341 
342  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
343  TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
344  evaluator.evalBlock(desc, scratch);
345  scratch.reset();
346  }
347  };
348 
349  // Evaluate small expressions directly as a single block.
350  if (tiling.block_mapper.blockCount() == 1) {
351  TensorBlockScratch scratch(device);
352  TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
353  evaluator.evalBlock(desc, scratch);
354  } else {
355  device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, std::move(eval_block));
356  }
357  }
358  evaluator.cleanup();
359  }
360 };
361 
362 template <typename Expression, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
363 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, Tiling> {
364  public:
365  typedef typename Expression::Index StorageIndex;
366  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
367 
368  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
369  TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
370 
371  const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
372  if (!need_assign) {
373  delete ctx;
374  return;
375  }
376 
377  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
378  const StorageIndex size = array_prod(ctx->evaluator.dimensions());
379  device.parallelForAsync(
380  size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
381  [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); },
382  [ctx]() { delete ctx; });
383  };
384 
385  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
386  }
387 
388  private:
389  struct TensorAsyncExecutorContext {
390  TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
391  : evaluator(expr, thread_pool), on_done(std::move(done)) {}
392 
393  ~TensorAsyncExecutorContext() {
394  evaluator.cleanup();
395  on_done();
396  }
397 
398  Evaluator evaluator;
399 
400  private:
401  DoneCallback on_done;
402  };
403 };
404 
405 template <typename Expression, typename DoneCallback, bool Vectorizable>
406 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, /*Tileable*/ TiledEvaluation::On> {
407  public:
408  typedef typename traits<Expression>::Index IndexType;
409  typedef typename traits<Expression>::Scalar Scalar;
410  typedef std::remove_const_t<Scalar> ScalarNoConst;
411 
412  static constexpr int NumDims = traits<Expression>::NumDimensions;
413 
414  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
415  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
416  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
417 
418  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
419  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
420 
421  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
422  TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
423 
424  const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
425  if (!need_assign) {
426  delete ctx;
427  return;
428  }
429 
430  ctx->tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
431 
432  auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
433  TensorBlockScratch scratch(ctx->device);
434 
435  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
436  TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx);
437  ctx->evaluator.evalBlock(desc, scratch);
438  scratch.reset();
439  }
440  };
441 
442  // Evaluate small expressions directly as a single block.
443  if (ctx->tiling.block_mapper.blockCount() == 1) {
444  TensorBlockScratch scratch(ctx->device);
445  TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
446  ctx->evaluator.evalBlock(desc, scratch);
447  delete ctx;
448  } else {
449  ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block,
450  [ctx]() { delete ctx; });
451  }
452  };
453 
454  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
455  }
456 
457  private:
458  struct TensorAsyncExecutorContext {
459  TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
460  : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {}
461 
462  ~TensorAsyncExecutorContext() {
463  evaluator.cleanup();
464  on_done();
465  }
466 
467  const ThreadPoolDevice& device;
468  Evaluator evaluator;
469  TilingContext tiling;
470 
471  private:
472  DoneCallback on_done;
473  };
474 };
475 
476 #endif // EIGEN_USE_THREADS
477 
478 // GPU: the evaluation of the expression is offloaded to a GPU.
479 #if defined(EIGEN_USE_GPU)
480 
481 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
482 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
483  public:
484  typedef typename Expression::Index StorageIndex;
485  static void run(const Expression& expr, const GpuDevice& device);
486 };
487 
488 #if defined(EIGEN_GPUCC)
489 // Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
490 template <typename Index>
491 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) {
492  const Index highest = NumTraits<Index>::highest();
493  const Index lowest = NumTraits<Index>::lowest();
494  if (lhs > 0 && rhs > 0) {
495  return lhs > highest - rhs ? 1 : 0;
496  } else if (lhs < 0 && rhs < 0) {
497  return lhs < lowest - rhs ? -1 : 0;
498  } else {
499  return 0;
500  }
501 }
502 
503 // Returns lhs + rhs, saturating to the highest/lowest representable value on
504 // overflow/underflow respectively.
505 template <typename Index>
506 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
507  const Index highest = NumTraits<Index>::highest();
508  const Index lowest = NumTraits<Index>::lowest();
509  int overflow = sum_will_overflow(lhs, rhs);
510  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
511 }
512 
513 // A functor that adds step_size to a given index, saturating to avoid
514 // overflow/underflow. If overflow/underflow is not possible, regular addition
515 // is used (for efficiency).
516 template <typename Index>
517 struct SafeStep {
518  // lastIdx is one past the end of the possible indexes.
519  // step_size is the value that will be added to the given index when the
520  // functor is called.
521  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
522  : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {}
523 
524  // Adds step_size to index, saturating on overflow (if overflow is possible).
525  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
526  return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
527  }
528 
529  private:
530  const bool can_overflow_;
531  const Index step_size_;
532 };
533 
534 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
535 struct EigenMetaKernelEval {
536  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
537  StorageIndex step_size) {
538  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
539  for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
540  eval.evalScalar(i);
541  }
542  }
543 };
544 
545 template <typename Evaluator, typename StorageIndex>
546 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
547  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
548  StorageIndex step_size) {
549  const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
550  const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
551  const StorageIndex vectorized_step_size = step_size * PacketSize;
552 
553  SafeStep<StorageIndex> safe_vectorized_step(vectorized_size, vectorized_step_size);
554  // Use the vector path
555  for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
556  eval.evalPacket(i);
557  }
558  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
559  for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
560  eval.evalScalar(i);
561  }
562  }
563 };
564 
565 template <typename Evaluator, typename StorageIndex>
566 __global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
567  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
568  const StorageIndex step_size = blockDim.x * gridDim.x;
569 
570  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
571  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
572 }
573 
574 /*static*/
575 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
576 EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(const Expression& expr,
577  const GpuDevice& device) {
578  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
579  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
580  if (needs_assign) {
581  const int block_size = device.maxGpuThreadsPerBlock();
582  const int max_blocks = static_cast<int>(
583  numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
584  NumTraits<StorageIndex>::highest()) /
585  block_size);
586  const StorageIndex size = array_prod(evaluator.dimensions());
587  // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
588  const int num_blocks = numext::maxi<int>(
589  numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
590 
591  LAUNCH_GPU_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size,
592  0, device, evaluator, size);
593  }
594  evaluator.cleanup();
595 }
596 
597 #endif // EIGEN_GPUCC
598 #endif // EIGEN_USE_GPU
599 
600 // SYCL Executor policy
601 #ifdef EIGEN_USE_SYCL
602 
603 template <typename Evaluator>
604 struct ExecExprFunctorKernel {
605  typedef typename Evaluator::Index Index;
606  Evaluator evaluator;
607  const Index range;
608  template <typename Scratch>
609  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_)
610  : evaluator(evaluator_), range(range_) {}
611 
612  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); }
613  template <bool is_vec = Evaluator::PacketAccess>
614  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
615  Index gId = static_cast<Index>(itemID.get_global_linear_id());
616  Index total_threads = itemID.get_global_range(0);
617 
618  for (Index i = gId; i < range; i += total_threads) {
619  evaluator.evalScalar(i);
620  }
621  }
622  template <bool is_vec = Evaluator::PacketAccess>
623  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
624  const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize;
625  Index gId = static_cast<Index>(itemID.get_global_linear_id());
626  const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
627  const Index start = Evaluator::PacketSize * gId;
628  for (Index i = start; i < vectorizedRange; i += step) {
629  evaluator.evalPacket(i);
630  }
631  gId += vectorizedRange;
632  for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
633  evaluator.evalScalar(i);
634  }
635  }
636 };
637 
638 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
639 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
640  public:
641  typedef typename Expression::Index Index;
642  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) {
644  Evaluator evaluator(expr, dev);
645  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
646  if (needs_assign) {
647  Index range, GRange, tileSize;
648  Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
649  total_size = (total_size == 0) ? 1 : total_size;
650  const int PacketSize = Eigen::PacketType<typename Evaluator::CoeffReturnType, Eigen::SyclDevice>::size;
651  Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
652  dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
653  range = total_size;
654 
655  dev.template nullary_kernel_launcher<typename Evaluator::CoeffReturnType, ExecExprFunctorKernel<Evaluator> >(
656  evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
657  range)
658  .wait();
659  }
660  evaluator.cleanup();
661  }
662 };
663 
664 #endif
665 
666 } // end namespace internal
667 
668 } // end namespace Eigen
669 
670 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
The tensor executor class.
Definition: TensorExecutor.h:76
Namespace containing all symbols from the Eigen library.
The tensor evaluator class.
Definition: TensorEvaluator.h:30
Definition: AutoDiffScalar.h:629
Definition: TensorAssign.h:55
Definition: TensorExecutor.h:108
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index