#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// Reducer function that enables multiple cuda threads to safely accumulate at the same
// output address. It reads the current value of the output, applies the reducer to it and
// to the new contribution, and attempts to swap the result in with atomicCAS. If another
// thread updated the output in the meantime, it retries with the freshly read value.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      // The contribution doesn't change the stored value, nothing to write back.
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
#if __CUDA_ARCH__ >= 300
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#endif // EIGEN_HAS_CUDA_FP16

// Sums of floats can use the hardware atomicAdd directly instead of the CAS loop.
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
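
// Reduces the whole input to a single value. Each thread accumulates up to NumPerThread
// coefficients, partial results are combined within a warp via __shfl_down, and one lane
// per warp folds its result into *output with atomicReduce. When several blocks are
// launched, a semaphore ensures the output is initialized exactly once before any block
// accumulates into it.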
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExch(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  if (num_coeffs % 2 != 0) {
    // Odd number of coefficients: fold the trailing element in right away so the main
    // kernel only has to deal with full half2 packets.
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                             half* output, half2* scratch) {
  eigen_assert(NumPerThread % 2 == 0);

  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the scratch value if it wasn't initialized by ReductionInitFullReduxKernelHalfFloat.
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
  }
  // Make sure the scratch value is visible to every thread before accumulation starts.
  __syncthreads();

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    eigen_assert(index + 1 < num_coeffs);
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    // Single-block launch: fold the two halves of the scratch value and write the final result.
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}

template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  eigen_assert(threadIdx.x == 0);  // launched with a single thread
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}

#endif // EIGEN_HAS_CUDA_FP16
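
// Host-side launchers: pick the kernel(s) and launch configuration for a full reduction
// based on the output scalar type and on whether the reducer supports packet access.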
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on floats and half floats");
  }
};

template <typename Self, typename Op, bool PacketAccess>
struct FullReductionLauncher<Self, Op, float, PacketAccess> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_CUDA_FP16
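
// FullReducer is the entry point used by the tensor evaluators when an entire tensor is
// reduced to a single value on a GpuDevice. A typical host-side use via the public Tensor
// API looks like the sketch below; it is illustrative only, and the device/pointer names
// are hypothetical. Only the dispatch into FullReducer is implemented in this file.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in_ptr, 128, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(gpu_out_ptr);
//   out.device(gpu_device) = in.sum();  // full reduction, handled by FullReducer<..., GpuDevice>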
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // The optimized path is restricted to the simple cases of floats and, when packet access
  // is available, half floats; other scalar types fall back to the generic implementation.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#elif __CUDA_ARCH__ >= 300
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      internal::is_same<typename Self::CoeffReturnType, float>::value;
#else
  static const bool HasOptimizedImplementation = false;
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
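
// Reduces the inner-most dimension of the input. Each row of num_coeffs_to_reduce values
// is mapped onto one or more column blocks; threads accumulate strided coefficients of a
// row, combine partial values with warp shuffles, and merge per-warp results into
// output[row] with atomicReduce.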
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      float reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          // The remaining columns don't fill a whole unrolled chunk: handle them one by one.
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel off the remaining odd coefficient of each row.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif // EIGEN_HAS_CUDA_FP16
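
// Host-side launchers for inner reductions. They return true when the caller should fall
// back to the default (non-GPU-optimized) code path, and false once the reduction kernels
// have been launched on the device.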
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float.
template <typename Self, typename Op, bool PacketAccess>
struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // The optimized path is restricted to floats and, when packet access is available, half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#elif __CUDA_ARCH__ >= 300
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      internal::is_same<typename Self::CoeffReturnType, float>::value;
#else
  static const bool HasOptimizedImplementation = false;
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the default code path when the reduced dimension is small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
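
// Reduces the outer-most dimension of the input: each thread walks up to NumPerThread
// values of the reduced dimension for a single preserved column, accumulates them locally,
// and merges the partial result into output[column] with atomicReduce.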
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
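
// Dispatcher for outer-dimension reductions on a GpuDevice. As with InnerReducer, run()
// returns true when the default code path should be used instead.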
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // The optimized path is restricted to floats.
#if __CUDA_ARCH__ >= 300
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      internal::is_same<typename Self::CoeffReturnType, float>::value;
#else
  static const bool HasOptimizedImplementation = false;
#endif

  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the default code path when the reduced dimension is small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H