$darkmode
Eigen-unsupported  5.0.1-dev
TensorCostModel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 // Class storing the cost of evaluating a tensor expression in terms of the
19 // estimated number of operand bytes loads, bytes stored, and compute cycles.
20 class TensorOpCost {
21  public:
22  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
23  // model based on minimal reciprocal throughput numbers from Intel or
24  // Agner Fog's tables would be better than what is there now.
25  template <typename ArgType>
26  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
27  return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType> >::Cost;
28  }
29  template <typename ArgType>
30  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
31  return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
32  }
33  template <typename ArgType>
34  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
35  return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
36  }
37  template <typename ArgType>
38  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
39  return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
40  }
41  template <typename SrcType, typename TargetType>
42  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
43  return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType> >::Cost;
44  }
45 
46  EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
47  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
48  : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {}
49 
50  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
51  double packet_size)
52  : bytes_loaded_(bytes_loaded),
53  bytes_stored_(bytes_stored),
54  compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
55  eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
56  eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
57  eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
58  }
59 
60  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
61  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
62  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
63  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
64  double compute_cost) const {
65  return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
66  }
67 
68  // Drop memory access component. Intended for cases when memory accesses are
69  // sequential or are completely masked by computations.
70  EIGEN_DEVICE_FUNC void dropMemoryCost() {
71  bytes_loaded_ = 0;
72  bytes_stored_ = 0;
73  }
74 
75  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
76  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const {
77  double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
78  double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
79  double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
80  return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
81  }
82 
83  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
84  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const {
85  double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
86  double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
87  double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
88  return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
89  }
90 
91  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs) {
92  bytes_loaded_ += rhs.bytes_loaded();
93  bytes_stored_ += rhs.bytes_stored();
94  compute_cycles_ += rhs.compute_cycles();
95  return *this;
96  }
97 
98  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
99  bytes_loaded_ *= rhs;
100  bytes_stored_ *= rhs;
101  compute_cycles_ *= rhs;
102  return *this;
103  }
104 
105  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs) {
106  lhs += rhs;
107  return lhs;
108  }
109  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs) {
110  lhs *= rhs;
111  return lhs;
112  }
113  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs) {
114  rhs *= lhs;
115  return rhs;
116  }
117 
118  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
119  return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
120  << ", compute_cycles = " << tc.compute_cycles() << "]";
121  }
122 
123  private:
124  double bytes_loaded_;
125  double bytes_stored_;
126  double compute_cycles_;
127 };
128 
129 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
130 // in [1:max_threads] instead of just switching multi-threading off for small
131 // work units.
139 template <typename Device>
141  public:
142  // Scaling from Eigen compute cost to device cycles.
143  static const int kDeviceCyclesPerComputeCycle = 1;
144 
145  // Costs in device cycles.
146  static const int kStartupCycles = 100000;
147  static const int kPerThreadCycles = 100000;
148  static const int kTaskSize = 40000;
149 
150  // Returns the number of threads in [1:max_threads] to use for
151  // evaluating an expression with the given output size and cost per
152  // coefficient.
153  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
154  int max_threads) {
155  double cost = totalCost(output_size, cost_per_coeff);
156  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
157  // Make sure we don't invoke undefined behavior when we convert to an int.
158  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
159  return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
160  }
161 
162  // taskSize assesses parallel task size.
163  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
164  // granularity needs to be increased to mitigate parallelization overheads.
165  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff) {
166  return totalCost(output_size, cost_per_coeff) / kTaskSize;
167  }
168 
169  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
170  const TensorOpCost& cost_per_coeff) {
171  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
172  // 11 is L2 cache latency on Haswell.
173  // We don't know whether data is in L1, L2 or L3. But we are most interested
174  // in single-threaded computational time around 100us-10ms (smaller time
175  // is too small for parallelization, larger time is not interesting
176  // either because we are probably using all available threads already).
177  // And for the target time range, L2 seems to be what matters. Data set
178  // fitting into L1 is too small to take noticeable time. Data set fitting
179  // only into L3 presumably will take more than 10ms to load and process.
180  const double kLoadCycles = 1.0 / 64 * 11;
181  const double kStoreCycles = 1.0 / 64 * 11;
182  // Scaling from Eigen compute cost to device cycles.
183  return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
184  }
185 };
186 
187 } // namespace Eigen
188 
189 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
Namespace containing all symbols from the Eigen library.
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorCostModel.h:140
const Product< SparseDerived, PermDerived, AliasFreeProduct > operator*(const SparseMatrixBase< SparseDerived > &matrix, const PermutationBase< PermDerived > &perm)