TensorCostModel.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {
// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
      const TensorOpCost& rhs) {
    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
    return *this;
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
      const TensorOpCost& rhs) {
    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};
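
// Illustrative example (not part of the original source): the per-coefficient
// cost of a simple binary float expression such as a + b can be modeled as
// two operand loads, one store, and one scalar add:
//
//   TensorOpCost cost(2 * sizeof(float),  // bytes loaded
//                     sizeof(float),      // bytes stored
//                     TensorOpCost::AddCost<float>());
//
// Costs compose arithmetically: per-coefficient costs of fused expressions
// add up via operator+, and operator* scales a cost by a coefficient count.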

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    // Subtract the fixed startup cost, charge kPerThreadCycles per thread,
    // and bias the truncating int conversion towards rounding up with + 0.9.
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }
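
  // Illustrative worked example (not part of the original source): with the
  // constants above, an expression whose total cost comes to 2e6 device
  // cycles gives
  //   threads = (2e6 - 100000) / 100000 + 0.9 = 19.9 -> 19 (int truncation),
  // so numThreads returns min(max_threads, 19). Any expression whose total
  // cost is at most kStartupCycles is kept single-threaded.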

  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from L2 cache. 64 is the typical cache line
    // size. 11 is the L2 cache latency on Haswell.
    // We don't know whether data is in L1, L2 or L3. But we are most
    // interested in single-threaded computational time of around 100us-10ms
    // (smaller times are too small to benefit from parallelization, and
    // larger times are not interesting either because we are probably
    // already using all available threads). For the target time range, L2
    // seems to be what matters: a data set fitting into L1 is too small to
    // take noticeable time, and a data set fitting only into L3 will
    // presumably take more than 10ms to load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
           cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                     kDeviceCyclesPerComputeCycle);
  }
};
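
// Illustrative usage sketch (not part of the original source): deciding how
// many threads to evaluate an expression with. ThreadPoolDevice is the device
// type used with this model by the Tensor module's multi-threaded executor;
// the output size and thread cap below are made-up values.
//
//   TensorOpCost cost(2 * sizeof(float),  // bytes loaded per coefficient
//                     sizeof(float),      // bytes stored per coefficient
//                     TensorOpCost::AddCost<float>());
//   int threads = TensorCostModel<ThreadPoolDevice>::numThreads(
//       /*output_size=*/1000000, cost, /*max_threads=*/16);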

} // namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H