TensorDeviceCuda.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

static const int kCudaScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of size 1k
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, to
  // maintain the invariant that the semaphore is always equal to 0 at
  // kernel start.
  virtual unsigned int* semaphore() const = 0;
};
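
// Illustrative note (not part of the original header): GpuDevice, defined
// below, holds a pointer to one of these interfaces and forwards all of its
// host-side operations to it. A minimal usage sketch, using the
// CudaStreamDevice implementation shipped in this file:
//
//   CudaStreamDevice stream_device;      // implements StreamInterface
//   GpuDevice device(&stream_device);
//   void* buf = device.allocate(256);    // forwarded to stream_device
//   device.deallocate(buf);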

static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    int num_devices;
    cudaError_t status = cudaGetDeviceCount(&num_devices);
    if (status != cudaSuccess) {
      std::cerr << "Failed to get the number of CUDA devices: "
                << cudaGetErrorString(status)
                << std::endl;
      assert(status == cudaSuccess);
    }
    m_deviceProperties = new cudaDeviceProp[num_devices];
    for (int i = 0; i < num_devices; ++i) {
      status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
      if (status != cudaSuccess) {
        std::cerr << "Failed to initialize CUDA device #"
                  << i
                  << ": "
                  << cudaGetErrorString(status)
                  << std::endl;
        assert(status == cudaSuccess);
      }
    }
    m_devicePropInitialized = true;
  }
}

static const cudaStream_t default_stream = cudaStreamDefault;

class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }

 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
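
// Illustrative sketch (not part of the original header) of the three ways the
// constructors above can be used. The stream created here is hypothetical and
// caller-owned; CudaStreamDevice never takes ownership of it.
//
//   CudaStreamDevice dev0;        // default stream on the current device
//   CudaStreamDevice dev1(1);     // default stream on device #1
//
//   cudaStream_t my_stream;
//   cudaStreamCreate(&my_stream);
//   CudaStreamDevice dev2(&my_stream, 1);  // caller-owned stream on device #1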

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
    return stream_->allocate(num_bytes);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
    stream_->deallocate(buffer);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
    return stream_->scratchpad();
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
#ifndef __CUDA_ARCH__
    return stream_->semaphore();
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err)
                << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().multiProcessorCount;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().maxThreadsPerBlock;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().sharedMemPerBlock;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().major;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().minor;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the CUDA runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};

#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);
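
// Illustrative sketch (not part of the original header) of how the pieces
// above fit together on the host side. The kernel "my_kernel" and the sizes
// are made up for the example; only the GpuDevice and LAUNCH_CUDA_KERNEL API
// shown here comes from this file.
//
//   __global__ void my_kernel(float* data, int n);
//
//   CudaStreamDevice stream_device;          // default stream, current device
//   GpuDevice device(&stream_device);        // does not take ownership
//   const int n = 1024;
//   float* buf = static_cast<float*>(device.allocate(n * sizeof(float)));
//   device.memset(buf, 0, n * sizeof(float));
//   const int block_size = device.maxCudaThreadsPerBlock();
//   const int grid_size = device.getNumCudaMultiProcessors();
//   LAUNCH_CUDA_KERNEL(my_kernel, grid_size, block_size, 0, device, buf, n);
//   device.synchronize();                    // wait for the async work
//   device.deallocate(buf);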

// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
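
// Example use on the host (a sketch, not from the original file): request
// eight-byte shared memory banks, which can benefit double-precision kernels.
//
//   setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);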
#endif

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H