// Include guard for this header; the same listing line also opens MulCost().
// MulCost<ArgType>() returns the Cost constant that internal::functor_traits
// reports for internal::scalar_product_op<ArgType, ArgType>.
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 30 template <
typename ArgType>
31 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int MulCost() {
32 return internal::functor_traits<
33 internal::scalar_product_op<ArgType, ArgType> >::Cost;
35 template <
typename ArgType>
36 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int AddCost() {
37 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
39 template <
typename ArgType>
40 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int DivCost() {
41 return internal::functor_traits<
42 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
44 template <
typename ArgType>
45 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int ModCost() {
46 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
48 template <
typename SrcType,
typename TargetType>
49 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int CastCost() {
50 return internal::functor_traits<
51 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
54 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
55 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles)
56 : bytes_loaded_(bytes_loaded),
57 bytes_stored_(bytes_stored),
58 compute_cycles_(compute_cycles) {}
// Builds a cost from byte counts and compute cycles, with an adjustment for
// vectorized evaluation: when `vectorized` is true the stored compute cycles
// are compute_cycles / packet_size.
// NOTE(review): the other branch of the ternary and the opening brace of the
// constructor body are not visible in this listing — confirm against the
// original header.
60 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles,
61 bool vectorized,
double packet_size)
62 : bytes_loaded_(bytes_loaded),
63 bytes_stored_(bytes_stored),
64 compute_cycles_(vectorized ? compute_cycles / packet_size
// Sanity checks on the constructor arguments (the parameters, not the stored
// members): each must be non-negative and finite.
66 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
67 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
68 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
// Const accessor returning a double.
// NOTE(review): the body is not visible in this listing; presumably it returns
// bytes_loaded_ — confirm against the original header.
71 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_loaded()
const {
// Const accessor returning a double.
// NOTE(review): the body is not visible in this listing; presumably it returns
// bytes_stored_ — confirm against the original header.
74 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_stored()
const {
77 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double compute_cycles()
const {
78 return compute_cycles_;
80 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double total_cost(
81 double load_cost,
double store_cost,
double compute_cost)
const {
82 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
83 compute_cost * compute_cycles_;
// Non-const member returning void.
// NOTE(review): the body is not visible in this listing; the name suggests it
// clears the bytes_loaded_/bytes_stored_ components — confirm against the
// original header.
88 EIGEN_DEVICE_FUNC
void dropMemoryCost() {
// In-place component-wise minimum: each of the three components of this cost
// is replaced by the smaller of its current value and the matching component
// of `rhs` (via numext::mini).
// NOTE(review): the return statement is not visible in this listing; the
// TensorOpCost& return type suggests `*this` — confirm.
94 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
95 const TensorOpCost& rhs) {
96 bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
97 bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
98 compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
// In-place component-wise maximum: each of the three components of this cost
// is replaced by the larger of its current value and the matching component
// of `rhs` (via numext::maxi).
// NOTE(review): the return statement is not visible in this listing; the
// TensorOpCost& return type suggests `*this` — confirm.
103 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
104 const TensorOpCost& rhs) {
105 bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
106 bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
107 compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
// Accumulates `rhs` into this cost: each of the three components of `rhs` is
// added to the matching component of this object.
// NOTE(review): the return statement is not visible in this listing; the
// TensorOpCost& return type suggests `*this` — confirm.
111 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
112 const TensorOpCost& rhs) {
113 bytes_loaded_ += rhs.bytes_loaded();
114 bytes_stored_ += rhs.bytes_stored();
115 compute_cycles_ += rhs.compute_cycles();
// Scales this cost in place: all three components are multiplied by `rhs`.
// NOTE(review): the return statement is not visible in this listing; the
// TensorOpCost& return type suggests `*this` — confirm.
119 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(
double rhs) {
120 bytes_loaded_ *= rhs;
121 bytes_stored_ *= rhs;
122 compute_cycles_ *= rhs;
// Binary addition of two costs; `lhs` is taken by value and a TensorOpCost is
// returned by value.
// NOTE(review): the body is not visible in this listing; presumably it applies
// operator+= to the by-value `lhs` and returns it — confirm.
126 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator+(
127 TensorOpCost lhs,
const TensorOpCost& rhs) {
// Scalar multiplication with the cost on the left; `lhs` is taken by value and
// a TensorOpCost is returned by value.
// NOTE(review): the body is not visible in this listing; presumably it applies
// operator*= to the by-value `lhs` and returns it — confirm.
131 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
132 TensorOpCost lhs,
double rhs) {
// Scalar multiplication with the scalar on the left; `rhs` is taken by value
// and a TensorOpCost is returned by value.
// NOTE(review): the body is not visible in this listing; presumably it applies
// operator*= to the by-value `rhs` and returns it — confirm.
136 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
137 double lhs, TensorOpCost rhs) {
142 friend std::ostream& operator<<(std::ostream& os,
const TensorOpCost& tc) {
143 return os <<
"[bytes_loaded = " << tc.bytes_loaded()
144 <<
", bytes_stored = " << tc.bytes_stored()
145 <<
", compute_cycles = " << tc.compute_cycles() <<
"]";
// The three components of a cost. total_cost() weights them with a
// per-unit load, store and compute cost respectively.
// Weighted by load_cost in total_cost().
149 double bytes_loaded_;
// Weighted by store_cost in total_cost().
150 double bytes_stored_;
// Weighted by compute_cost in total_cost(); the vectorized constructor stores
// compute_cycles / packet_size here.
151 double compute_cycles_;
// Cost model parameterized on a Device type. The members visible in this
// listing are static: tuning constants plus numThreads(), taskSize() and
// totalCost().
157 template <
typename Device>
158 class TensorCostModel {
// Passed as the compute weight to TensorOpCost::total_cost() in totalCost().
161 static const int kDeviceCyclesPerComputeCycle = 1;
// Subtracted from the total cost in numThreads() before the per-thread division.
164 static const int kStartupCycles = 100000;
// Divisor applied to (cost - kStartupCycles) in numThreads().
165 static const int kPerThreadCycles = 100000;
// Divisor applied to the total cost in taskSize().
166 static const int kTaskSize = 40000;
171 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int numThreads(
172 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
173 double cost = totalCost(output_size, cost_per_coeff);
174 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
175 return numext::mini(max_threads, numext::maxi(1, threads));
181 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double taskSize(
182 double output_size,
const TensorOpCost& cost_per_coeff) {
183 return totalCost(output_size, cost_per_coeff) / kTaskSize;
// Combines `output_size` and the per-coefficient cost into a single double.
// NOTE(review): no `return` (and no use of output_size) is visible before the
// total_cost() call below; the leading line of that statement appears to have
// been dropped from this listing — confirm against the original header.
187 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double totalCost(
188 double output_size,
const TensorOpCost& cost_per_coeff) {
// Per-byte weights for loads and stores, both 11/64.
// NOTE(review): presumably a cache latency divided by a cache-line size —
// confirm the origin of these constants.
198 const double kLoadCycles = 1.0 / 64 * 11;
199 const double kStoreCycles = 1.0 / 64 * 11;
// Weighted per-coefficient cost; compute cycles are scaled by
// kDeviceCyclesPerComputeCycle.
202 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
203 kDeviceCyclesPerComputeCycle);
209 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
// Namespace containing all symbols from the Eigen library.
// Definition: AdolcForward:45