TensorReduction.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

namespace internal {
template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename Dims, typename XprType>
struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType>& type;
};

template<typename Op, typename Dims, typename XprType>
struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType> type;
};


template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};
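// Illustration (a sketch, not part of the implementation): with
// input_dims = (2, 3, 5, 7) and reduced = {false, true, false, true}, run() fills
// *output_dims with (2, 5) and *reduced_dims with (3, 7); each input dimension is
// routed, in order, to exactly one of the two arrays.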

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};


template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif
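// What these predicates detect (a sketch; it assumes the reduced dimensions are a
// compile-time Eigen::IndexList and that the constexpr path above is compiled in,
// otherwise they simply stay false and the generic code paths are used):
//  - ColMajor rank-4 tensor, reducing dims {0, 1}: the reduced dims are exactly the
//    inner-most ones, so are_inner_most_dims is true.
//  - ColMajor rank-4 tensor, reducing dims {1, 3}: dim 0 is left untouched, so
//    preserve_inner_most_dims is true.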


template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};
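// Recursion sketch: with two reduced dimensions, GenericDimReducer<1>::reduce()
// steps through m_reducedDims[1] using m_reducedStrides[1] and calls
// GenericDimReducer<0>, which walks m_reducedDims[0] and feeds every visited
// coefficient to reducer.reduce(). The <-1> specialization covers the case of zero
// reduced dimensions, where the single coefficient at `index` is reduced directly.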

template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};
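// Walkthrough (illustrative numbers only): with numValuesToReduce == 10 and a packet
// size of 4, VectorizedSize == 8, so two packet loads go through reducer.reducePacket()
// and the remaining 2 coefficients through the scalar reducer.reduce() loop;
// reducer.finalizeBoth() then merges the packet and scalar accumulators.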

template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};
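// FullReducer collapses every coefficient into a single value. A user-level sketch
// of a full reduction (sizes are arbitrary) is shown below; note that the evaluator
// only dispatches to FullReducer::run() when the device-specific specialization
// advertises HasOptimizedImplementation, otherwise the result is computed
// coefficient-wise through InnerMostDimReducer.
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 0> total = t.sum();  // rank-0 tensor holding the sum
//   float value = total();                    // read the scalar back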


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                        typename Self::Index numValuesToReduce, Op& reducer,
                                                        typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;
  static const int PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};
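// Sharding sketch for the ThreadPoolDevice reducer above (illustrative numbers): if
// num_coeffs == 1000 and the cost model settles on 4 threads, then blocksize == 250
// and numblocks == 4, so four shards of 250 coefficients are reduced concurrently;
// any remainder (none here) is reduced on the calling thread as finalShard, and the
// per-shard partial results are merged with reducer.reduce() before finalize().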

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
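// Rough guide to the two cases (illustrative, ColMajor m x n matrix): reducing
// dimension 0 collapses contiguous columns, which is the problem shape InnerReducer
// targets; reducing dimension 1 collapses strided rows, which is what OuterReducer
// targets. These defaults only exist so that device-specific code (e.g. the GPU
// kernels declared below) can specialize them.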


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);


#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);

#endif

template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

} // end namespace internal


template <typename Op, typename Dims, typename XprType>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};
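// TensorReductionOp is normally not constructed directly: TensorBase methods such as
// sum(), maximum() or mean() build it. A usage sketch (sizes are arbitrary):
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 1> dims = {{1}};
//   Eigen::Tensor<float, 2> s = t.sum(dims);    // reduce over dim 1 -> shape (2, 4)
//   Eigen::Tensor<float, 0> m = t.maximum();    // full reduction to a rank-0 tensor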


// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides.back() = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }
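  // Worked example for the precomputation above (illustrative numbers): a ColMajor
  // input of dimensions (2, 3, 5, 7) has input strides (1, 2, 6, 30). Reducing
  // dims {1, 3} yields m_dimensions = (2, 5) with m_outputStrides = (1, 2),
  // m_preservedStrides = (1, 6) (the input strides of the kept dims),
  // m_reducedDims = (3, 7) and m_reducedStrides = (2, 30).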

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU)) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }

      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve) {
          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
          m_result = data;
        }
        Op reducer(m_reducer);
        return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve) || (m_result != NULL);
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve) {
          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
          m_result = data;
        }
        Op reducer(m_reducer);
        return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve) || (m_result != NULL);
      }
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }
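  // Continuing the worked example given after the constructor (ColMajor input
  // (2, 3, 5, 7), reducing dims {1, 3}): the output coordinate (o0, o1) has linear
  // output index o0 + 2*o1, and firstInput() maps it to o0 + 6*o1, i.e. the input
  // offset of coordinate (o0, 0, o1, 0); the reduction then walks dims 1 and 3 from
  // that starting offset using m_reducedStrides = (2, 30).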

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
#else
  static const bool RunningOnGPU = false;
#endif
  CoeffReturnType* m_result;

  const Device& m_device;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H