#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
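
// Tensor reduction: this header provides the TensorReductionOp expression, its
// evaluator, and the internal helpers that perform the actual reduction on the
// CPU (optionally multithreaded via EIGEN_USE_THREADS) and, when compiled with
// CUDA, on the GPU.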
namespace Eigen {
namespace internal {

template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
 : traits<XprType>
{
  typedef typename traits<XprType>::Scalar Scalar;
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename traits<XprType>::StorageKind StorageKind;
  typedef typename traits<XprType>::Index Index;
  typedef typename XprType::Nested Nested;
};

template<typename Op, typename Dims, typename XprType>
struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType>& type;
};

template<typename Op, typename Dims, typename XprType>
struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType> type;
};
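
// DimInitializer splits the dimensions of the input tensor into the dimensions
// kept in the output and the dimensions that are reduced away. The Sizes<>
// specialization below handles full reductions, where the output is a
// 0-dimensional tensor and every input dimension is reduced.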
template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};
template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
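
// Compile-time detection of how the reduced dimensions relate to the layout.
// With C++11 constexpr support, are_inner_most_dims detects whether the
// reduced dimensions form the innermost (contiguous) part of the tensor, and
// preserve_inner_most_dims detects whether the innermost dimensions are all
// preserved; both enable faster code paths in the evaluator. Without constexpr
// the conservative default of false is used.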
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif
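
// GenericDimReducer recursively walks the reduced dimensions, from the
// outermost (DimIndex) down to dimension 0, and feeds every input coefficient
// that maps to the same output location into the reducer's accumulator.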
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};
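
// InnerMostDimReducer reduces a contiguous range of coefficients starting at
// firstIndex. The default version accumulates scalar by scalar; the
// specialization below uses packet loads for the bulk of the range and falls
// back to scalars for the remainder.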
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
  reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};
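
// InnerMostDimPreserver handles the case where the innermost dimensions are
// preserved: entire packets of the output can then be accumulated at once by
// recursing over the reduced dimensions. The non-vectorized primary template
// and the DimIndex == -1 specialization only assert, as they should never be
// called.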
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
  }
};

#ifdef EIGEN_USE_THREADS
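// Multithreaded full reducers. Each FullReducerShard reduces a contiguous
// block of coefficients into a partial accumulator; the partial results are
// merged once all shards have completed.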
template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static void run(const Eval& eval, typename Eval::Index firstIndex,
                  typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::CoeffReturnType saccum;
};
template <typename Eval, typename Op>
struct FullReducerShard<Eval, Op, true> {
  static void run(const Eval& eval, typename Eval::Index firstIndex,
                  typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size;
    const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;

    shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>();
    for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum);
    }
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::PacketReturnType paccum;
  typename Eval::CoeffReturnType saccum;
};
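
// ThreadPoolDevice specializations of FullReducer. The input is split into
// roughly one block per thread, each block is reduced by a shard enqueued on
// the device's thread pool, and the shard results are merged on the calling
// thread. They are only advertised as optimized for stateless reducers, since
// each shard runs its own copy of the reduction functor.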
template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, false> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, false> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run,
                                       self, i * blocksize, blocksize, reducer, &shards[i]));
    }

    // Reduce the leftover coefficients (if any) on the calling thread.
    FullReducerShard<Self, Op, false> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      FullReducerShard<Self, Op, false>::run(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.saccum = reducer.initialize();
    }

    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }
    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }
    *output = reducer.finalize(finalShard.saccum);
  }
};
template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, true> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs) / device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, true> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                       self, i * blocksize, blocksize, reducer, &shards[i]));
    }

    // Reduce the leftover coefficients (if any) on the calling thread.
    FullReducerShard<Self, Op, true> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      FullReducerShard<Self, Op, true>::run(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
      finalShard.saccum = reducer.initialize();
    }

    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }
    for (Index i = 0; i < numblocks; ++i) {
      reducer.reducePacket(shards[i].paccum, &finalShard.paccum);
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }
    *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum);
  }
};
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Forward declaration of the CUDA full-reduction kernel, implemented elsewhere
// in the Tensor module.
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
#endif

}  // end namespace internal
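
// TensorReductionOp represents the reduction of the nested expression over the
// set of dimensions Dims using the reducer functor Op. It is normally created
// through the reduction helpers of TensorBase (sum(), maximum(), ...) rather
// than constructed directly. Illustrative sketch (not part of this header,
// assumes the usual Tensor API):
//
//   Eigen::Tensor<float, 3> t(2, 3, 5);
//   t.setRandom();
//   Eigen::array<int, 2> dims({1, 2});
//   Eigen::Tensor<float, 1> s = t.sum(dims);  // s(i) accumulates t(i, j, k) over j and k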
template <typename Op, typename Dims, typename XprType>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
        : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};
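
// Eval as rvalue: evaluator for reduction expressions. It precomputes the
// output dimensions and the relevant input strides, then computes each output
// coefficient (or packet) on demand by reducing the corresponding input
// coefficients. Full reductions are delegated to internal::FullReducer when an
// optimized (threaded or GPU) implementation is available.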
template<typename Op, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating whether each input dimension is reduced.
    array<bool, NumInputDims> reduced;
    for (int i = 0; i < NumInputDims; ++i) {
      reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims);

    // Precompute the strides of the output tensor.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides[NumOutputDims - 1] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute the input strides and split them between the reduced and the
    // preserved dimensions.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
        }
      } else {
        input_strides[NumInputDims - 1] = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions.
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }

      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (RunningFullReduction && m_result) {
      return *m_result;
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0]
                                                                   : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0]
                                                                   : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < packetSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // The packet can be computed directly only if it doesn't span several
      // rows of the innermost preserved dimension.
      if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < packetSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < packetSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I>
  friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
#endif

  // Returns the index in the input tensor of the first coefficient that
  // contributes to the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // idx is the i-th coordinate of the output index.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // idx is the i-th coordinate of the output index.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }
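
  // Illustrative example (assumed shapes, not from the original source): for a
  // ColMajor input of dimensions [4, 5, 6] reduced over dimension 1, the output
  // has dimensions [4, 6], m_outputStrides == {1, 4}, m_preservedStrides == {1, 20}
  // and m_reducedStrides == {4}. For output index 9, i.e. coordinates (1, 2),
  // firstInput returns 2 * 20 + 1 * 1 == 41, and the 5 reduced coefficients are
  // then visited at input indices 41, 45, 49, 53 and 57.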

  // Dimensions of the output of the reduction.
  Dimensions m_dimensions;
  // Precomputed strides of the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Strides of the input tensor for the preserved dimensions, indexed by output dimension.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;
  // Strides and sizes of the input tensor for the reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  array<Index, NumReducedDims> m_reducedDims;
  // Evaluator for the input expression and the reduction functor to apply.
  TensorEvaluator<ArgType, Device> m_impl;
  Op m_reducer;

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
#else
  static const bool RunningOnGPU = false;
#endif
  // Cached result of a full reduction, if one was precomputed in evalSubExprsIfNeeded().
  CoeffReturnType* m_result;
  const Device& m_device;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H