TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

/** \class TensorExecutor
  * \ingroup CXX11_Tensor_Module
  *
  * \brief The tensor executor class.
  */
namespace internal {

// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device, bool Vectorizable>
class TensorExecutor
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
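
// Usage sketch (added for exposition, not part of the original header): a plain
// tensor assignment is what ultimately funnels into this executor. Assuming the
// standard unsupported Tensor module API:
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // evaluated via TensorExecutor with DefaultDevice
//
// The scalar loop above is the fallback; the specialization below is selected
// when the expression can be vectorized.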


template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      const Index VectorizedSize = (size / PacketSize) * PacketSize;

      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
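
// Worked example (illustrative, assuming PacketSize == 4): for size == 10,
// VectorizedSize == (10 / 4) * 4 == 8, so evalPacket() is called at indices
// 0 and 4, and the scalar tail loop handles indices 8 and 9.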



// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);

    Index i = first;
    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }

    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};
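
// Illustration (added here, not in the original source): EvalRange<..., true>::run(eval, 0, 10)
// with PacketSize == 4 evaluates packets at indices 0 and 4 (lastPacket == 8),
// then falls back to evalScalar() for indices 8 and 9. When the range is
// shorter than a packet, only the scalar loop runs.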

template<typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
{
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());

      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;

      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
      const Index numblocks = size / blocksize;

      std::vector<Notification*> results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i) {
        results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
      }

      if (numblocks * blocksize < size) {
        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
      }

      for (int i = 0; i < numblocks; ++i) {
        wait_until_ready(results[i]);
        delete results[i];
      }
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS
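
// Usage sketch (not part of the original file; the ThreadPoolDevice
// constructor has changed across Eigen versions, so treat the setup below as
// an assumption):
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice my_device(&pool, 4);
//   c.device(my_device) = a + b;  // dispatches to the executor above
//
// Block-size arithmetic, worked through for size == 1000, 4 threads and
// PacketSize == 4: blocksz == ceil(1000 / 4.0) + 3 == 253, blocksize ==
// max(4, 253 - 253 % 4) == 252, numblocks == 3; three blocks of 252 indices
// are enqueued on the pool and the remaining 244 indices run on the calling
// thread.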


// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(__CUDACC__)

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the scalar path
  for (Index i = first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the vector path
  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  const Index vectorized_step_size = step_size * PacketSize;
  const Index vectorized_size = (size / PacketSize) * PacketSize;
  for (Index i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    eval.evalPacket(i);
  }
  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}
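
// Illustration (added for exposition, not in the original source): both
// kernels use a grid-stride loop. With, say, 2 blocks of 256 threads,
// step_size == 512, so thread 0 touches indices 0, 512, 1024, ... In the
// vectorized kernel with PacketSize == 4, the same thread evaluates packets
// at 0, 2048, 4096, ... and the scalar tail loop covers the last
// size % PacketSize elements.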

/*static*/
template <typename Expression>
inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
{
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}


/*static*/
template<typename Expression>
inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
{
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign)
  {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
    LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

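// Usage sketch (illustrative only; GpuDevice construction details vary
// between Eigen releases and CUDA setups, so the stream wrapper below is an
// assumption):
//
//   #define EIGEN_USE_GPU
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > in1(d_in1, 200), in2(d_in2, 200), out(d_out, 200);
//   out.device(gpu_device) = in1 + in2;  // launches one of the kernels above
//
// Here d_in1, d_in2 and d_out are assumed to be float buffers already
// allocated on the device.
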
#endif // __CUDACC__
#endif // EIGEN_USE_GPU

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H