Eigen  3.2.93
GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
19 class gebp_traits;
20 
21 
23 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
24 {
25  return a<=0 ? b : a;
26 }
27 
28 #if EIGEN_ARCH_i386_OR_x86_64
29 const std::ptrdiff_t defaultL1CacheSize = 32*1024;
30 const std::ptrdiff_t defaultL2CacheSize = 256*1024;
31 const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
32 #else
33 const std::ptrdiff_t defaultL1CacheSize = 16*1024;
34 const std::ptrdiff_t defaultL2CacheSize = 512*1024;
35 const std::ptrdiff_t defaultL3CacheSize = 512*1024;
36 #endif
37 
39 struct CacheSizes {
40  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
41  int l1CacheSize, l2CacheSize, l3CacheSize;
42  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
43  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
44  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
45  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
46  }
47 
48  std::ptrdiff_t m_l1;
49  std::ptrdiff_t m_l2;
50  std::ptrdiff_t m_l3;
51 };
52 
53 
55 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
56 {
57  static CacheSizes m_cacheSizes;
58 
59  if(action==SetAction)
60  {
61  // set the CPU cache sizes (in bytes) from which all block sizes are derived
62  eigen_internal_assert(l1!=0 && l2!=0);
63  m_cacheSizes.m_l1 = *l1;
64  m_cacheSizes.m_l2 = *l2;
65  m_cacheSizes.m_l3 = *l3;
66  }
67  else if(action==GetAction)
68  {
69  eigen_internal_assert(l1!=0 && l2!=0);
70  *l1 = m_cacheSizes.m_l1;
71  *l2 = m_cacheSizes.m_l2;
72  *l3 = m_cacheSizes.m_l3;
73  }
74  else
75  {
76  eigen_internal_assert(false);
77  }
78 }
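// Usage sketch (illustrative only): Eigen's public helpers l1CacheSize(), l2CacheSize() and
// setCpuCacheSizes() forward to this function, but it can also be driven directly:
// \code
//   std::ptrdiff_t l1, l2, l3;
//   Eigen::internal::manage_caching_sizes(Eigen::GetAction, &l1, &l2, &l3); // query current sizes
//   l2 = 1024*1024;                                                         // pretend L2 is 1MB
//   Eigen::internal::manage_caching_sizes(Eigen::SetAction, &l1, &l2, &l3); // override them
// \endcode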
79 
80 /* Helper for computeProductBlockingSizes.
81  *
82  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
83  * this function computes the blocking size parameters along the respective dimensions
84  * for matrix products and related algorithms. The blocking sizes depend on various
85  * parameters:
86  * - the L1 and L2 cache sizes,
87  * - the register level blocking sizes defined by gebp_traits,
88  * - the number of scalars that fit into a packet (when vectorization is enabled).
89  *
90  * \sa setCpuCacheSizes */
91 
92 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
93 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
94 {
95  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
96 
97  // Explanations:
98  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
99  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
100  // in small horizontal panels of size mr x kc, where mr is the blocking size along the m dimension
101  // at the register level. Each such small horizontal panel has to stay within the L1 cache.
102  std::ptrdiff_t l1, l2, l3;
103  manage_caching_sizes(GetAction, &l1, &l2, &l3);
104 
105  if (num_threads > 1) {
106  typedef typename Traits::ResScalar ResScalar;
107  enum {
108  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
109  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
110  kr = 8,
111  mr = Traits::mr,
112  nr = Traits::nr
113  };
114  // Increasing k gives us more time to prefetch the content of the "C"
115  // registers. However once the latency is hidden there is no point in
116  // increasing the value of k, so we'll cap it at 320 (value determined
117  // experimentally).
118  const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
119  if (k_cache < k) {
120  k = k_cache - (k_cache % kr);
121  eigen_internal_assert(k > 0);
122  }
123 
124  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
125  const Index n_per_thread = numext::div_ceil(n, num_threads);
126  if (n_cache <= n_per_thread) {
127  // Don't exceed the capacity of the l2 cache.
128  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
129  n = n_cache - (n_cache % nr);
130  eigen_internal_assert(n > 0);
131  } else {
132  n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
133  }
134 
135  if (l3 > l2) {
136  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
137  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
138  const Index m_per_thread = numext::div_ceil(m, num_threads);
139  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
140  m = m_cache - (m_cache % mr);
141  eigen_internal_assert(m > 0);
142  } else {
143  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
144  }
145  }
146  }
147  else {
148  // In unit tests we do not want to use extra large matrices,
149  // so we reduce the cache sizes to check that the blocking strategy is not flawed
150 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
151  l1 = 9*1024;
152  l2 = 32*1024;
153  l3 = 512*1024;
154 #endif
155 
156  // Early return for small problems because the computations below are time consuming for them.
157  // Perhaps it would make more sense to consider k*n*m??
158  // Note that for very tiny problems, this function should be bypassed anyway
159  // because we use the coefficient-based implementation for them.
160  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
161  return;
162 
163  typedef typename Traits::ResScalar ResScalar;
164  enum {
165  k_peeling = 8,
166  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
167  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
168  };
169 
170  // ---- 1st level of blocking on L1, yields kc ----
171 
172  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
173  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
174  // We also include a register-level block of the result (mr x nr).
175  // (In an ideal world only the lhs panel would stay in L1)
176  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
177  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
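  // For instance (a sketch, assuming single precision with AVX+FMA and KcFactor==1:
  // mr==24, nr==4, sizeof(float)==4, l1==32768): k_sub = 24*4*4 = 384,
  // k_div = 24*4 + 4*4 = 112, hence max_kc = ((32768-384)/112) & ~7 = 289 & ~7 = 288.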
178  const Index old_k = k;
179  if(k>max_kc)
180  {
181  // We are really blocking on the third dimension:
182  // -> reduce blocking size to make sure the last block is as large as possible
183  // while keeping the same number of sweeps over the result.
184  k = (k%max_kc)==0 ? max_kc
185  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
186 
187  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
188  }
189 
190  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
191 
192  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
193  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
194  // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
195  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
196  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
197  const Index actual_l2 = l3;
198  #else
199  const Index actual_l2 = 1572864; // == 1.5 MB
200  #endif
201 
202  // Here, nc is chosen such that a kc x nc block of the rhs fits within half of L2.
203  // The second half is implicitly reserved for accessing the result and lhs coefficients.
204  // When k<max_kc, nc can grow arbitrarily. In practice, it seems fruitful
205  // to limit this growth: we bound nc to grow by a factor of 1.5.
206  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
207  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
208  Index max_nc;
209  const Index lhs_bytes = m * k * sizeof(LhsScalar);
210  const Index remaining_l1 = l1- k_sub - lhs_bytes;
211  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
212  {
213  // L1 blocking
214  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
215  }
216  else
217  {
218  // L2 blocking
219  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
220  }
221  // WARNING Below, we assume that Traits::nr is a power of two.
222  Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
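  // Continuing the single-precision sketch above (k==max_kc==288, actual_l2==1572864, and the
  // lhs block too large for L1, so max_nc = (3*1572864)/(2*2*288*4) = 1024):
  // nc = min(1572864/(2*288*4), 1024) & ~3 = min(682, 1024) & ~3 = 680.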
223  if(n>nc)
224  {
225  // We are really blocking over the columns:
226  // -> reduce blocking size to make sure the last block is as large as possible
227  // while keeping the same number of sweeps over the packed lhs.
228  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
229  n = (n%nc)==0 ? nc
230  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
231  }
232  else if(old_k==k)
233  {
234  // So far, no blocking at all, i.e., kc==k, and nc==n.
235  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in the L1/L2 cache.
236  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
237  Index problem_size = k*n*sizeof(LhsScalar);
238  Index actual_lm = actual_l2;
239  Index max_mc = m;
240  if(problem_size<=1024)
241  {
242  // the problem is small enough to be kept in L1
243  // Let's choose m such that the lhs block fits in 1/3 of L1
244  actual_lm = l1;
245  }
246  else if(l3!=0 && problem_size<=32768)
247  {
248  // we have both L2 and L3, and the problem is small enough to be kept in L2
249  // Let's choose m such that the lhs block fits in 1/3 of L2
250  actual_lm = l2;
251  max_mc = (numext::mini<Index>)(576,max_mc);
252  }
253  Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
254  if (mc > Traits::mr) mc -= mc % Traits::mr;
255  else if (mc==0) return;
256  m = (m%mc)==0 ? mc
257  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
258  }
259  }
260 }
261 
262 template <typename Index>
263 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
264 {
265 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
266  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
267  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
268  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
269  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
270  return true;
271  }
272 #else
273  EIGEN_UNUSED_VARIABLE(k)
274  EIGEN_UNUSED_VARIABLE(m)
275  EIGEN_UNUSED_VARIABLE(n)
276 #endif
277  return false;
278 }
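// Test-hook sketch (illustrative): when building the unit tests, fixed blocking sizes can be
// forced from the compiler command line, e.g.
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZES=1 -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K=64
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M=48 -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N=96
// in which case computeProductBlockingSizes() below clamps k, m and n to at most these values
// instead of running the cache-based heuristic.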
279 
296 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
297 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
298 {
299  if (!useSpecificBlockingSizes(k, m, n)) {
300  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
301  }
302 }
303 
304 template<typename LhsScalar, typename RhsScalar, typename Index>
305 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
306 {
307  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
308 }
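// Usage sketch (illustrative only): pass the problem dimensions in and read the clamped
// blocking sizes back from the same variables.
// \code
//   Eigen::Index k = 4096, m = 4096, n = 4096;
//   Eigen::internal::computeProductBlockingSizes<float,float>(k, m, n);
//   // k, m, n now hold kc, mc, nc: the panel sizes used by the packing and gebp kernels below.
// \endcode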
309 
310 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
311  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
312 #else
313 
314  // FIXME (a bit overkill maybe ?)
315 
316  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
317  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
318  {
319  c = cj.pmadd(a,b,c);
320  }
321  };
322 
323  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
324  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
325  {
326  t = b; t = cj.pmul(a,t); c = padd(c,t);
327  }
328  };
329 
330  template<typename CJ, typename A, typename B, typename C, typename T>
331  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
332  {
333  gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
334  }
335 
336  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
337 // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
338 #endif
339 
340 /* Vectorization logic
341  * real*real: unpack rhs to constant packets, ...
342  *
343  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
344  * storing each res packet into two packets (2x2),
345  * at the end combine them: swap the second and addsub them
346  * cf*cf : same but with 2x4 blocks
347  * cplx*real : unpack rhs to constant packets, ...
348  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
349  */
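/* Worked example of the cd*cd scheme above (math only, nothing new is computed here):
 * with a = a_r + i*a_i read from the packed lhs and b = b_r + i*b_i from the packed rhs,
 * the DoublePacket accumulator gathers
 *   first  += a * b_r = (a_r*b_r, a_i*b_r)
 *   second += a * b_i = (a_r*b_i, a_i*b_i)
 * and acc() rebuilds a*b = (a_r*b_r - a_i*b_i) + i*(a_i*b_r + a_r*b_i) by swapping the
 * lanes of 'second' (pcplxflip) and adding/subtracting it into 'first', with the signs
 * adjusted according to ConjLhs/ConjRhs.
 */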
350 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
351 class gebp_traits
352 {
353 public:
354  typedef _LhsScalar LhsScalar;
355  typedef _RhsScalar RhsScalar;
356  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
357 
358  enum {
359  ConjLhs = _ConjLhs,
360  ConjRhs = _ConjRhs,
361  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
362  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
363  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
364  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
365 
366  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
367 
368  // register block size along the N direction must be 1 or 4
369  nr = 4,
370 
371  // register block size along the M direction (currently, this one cannot be modified)
372  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
373 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
374  // we assume 16 registers
375  // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
376  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
377  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
378 #else
379  mr = default_mr,
380 #endif
381 
382  LhsProgress = LhsPacketSize,
383  RhsProgress = 1
384  };
385 
386  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
387  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
388  typedef typename packet_traits<ResScalar>::type _ResPacket;
389 
390  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
391  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
392  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
393 
394  typedef ResPacket AccPacket;
395 
396  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
397  {
398  p = pset1<ResPacket>(ResScalar(0));
399  }
400 
401  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
402  {
403  pbroadcast4(b, b0, b1, b2, b3);
404  }
405 
406 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
407 // {
408 // pbroadcast2(b, b0, b1);
409 // }
410 
411  template<typename RhsPacketType>
412  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
413  {
414  dest = pset1<RhsPacketType>(*b);
415  }
416 
417  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
418  {
419  dest = ploadquad<RhsPacket>(b);
420  }
421 
422  template<typename LhsPacketType>
423  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
424  {
425  dest = pload<LhsPacketType>(a);
426  }
427 
428  template<typename LhsPacketType>
429  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
430  {
431  dest = ploadu<LhsPacketType>(a);
432  }
433 
434  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
435  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
436  {
437  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
438  // let gcc allocate the register in which to store the result of the pmul
439  // (in the case where there is no FMA) gcc fails to figure out how to avoid
440  // spilling registers.
441 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
442  EIGEN_UNUSED_VARIABLE(tmp);
443  c = pmadd(a,b,c);
444 #else
445  tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
446 #endif
447  }
448 
449  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
450  {
451  r = pmadd(c,alpha,r);
452  }
453 
454  template<typename ResPacketHalf>
455  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
456  {
457  r = pmadd(c,alpha,r);
458  }
459 
460 protected:
461 // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
462 // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
463 };
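// Protocol sketch (illustrative only, heavily simplified from the gebp_kernel below): the
// traits above are meant to be composed as follows; blockA, blockB, kc, resPtr and alpha are
// placeholder names, not part of this file.
// \code
//   typedef gebp_traits<float,float> Traits;
//   Traits traits;
//   Traits::AccPacket C0;
//   Traits::LhsPacket A0;
//   Traits::RhsPacket B_0, T0;
//   traits.initAcc(C0);                                     // C0 = 0
//   for(Index k=0; k<kc; ++k)
//   {
//     traits.loadLhs(blockA + k*Traits::LhsProgress, A0);   // one packet of the packed lhs
//     traits.loadRhs(blockB + k*Traits::RhsProgress, B_0);  // broadcast one rhs coefficient
//     traits.madd(A0, B_0, C0, T0);                         // C0 += A0 * B_0
//   }
//   Traits::ResPacket R = ploadu<Traits::ResPacket>(resPtr);
//   traits.acc(C0, pset1<Traits::ResPacket>(alpha), R);     // R += alpha * C0
//   pstoreu(resPtr, R);
// \endcode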
464 
465 template<typename RealScalar, bool _ConjLhs>
466 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
467 {
468 public:
469  typedef std::complex<RealScalar> LhsScalar;
470  typedef RealScalar RhsScalar;
471  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
472 
473  enum {
474  ConjLhs = _ConjLhs,
475  ConjRhs = false,
476  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
477  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
478  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
479  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
480 
481  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
482  nr = 4,
483 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
484  // we assume 16 registers
485  mr = 3*LhsPacketSize,
486 #else
487  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
488 #endif
489 
490  LhsProgress = LhsPacketSize,
491  RhsProgress = 1
492  };
493 
494  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
495  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
496  typedef typename packet_traits<ResScalar>::type _ResPacket;
497 
498  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
499  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
500  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
501 
502  typedef ResPacket AccPacket;
503 
504  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
505  {
506  p = pset1<ResPacket>(ResScalar(0));
507  }
508 
509  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
510  {
511  dest = pset1<RhsPacket>(*b);
512  }
513 
514  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
515  {
516  dest = pset1<RhsPacket>(*b);
517  }
518 
519  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
520  {
521  dest = pload<LhsPacket>(a);
522  }
523 
524  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
525  {
526  dest = ploadu<LhsPacket>(a);
527  }
528 
529  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
530  {
531  pbroadcast4(b, b0, b1, b2, b3);
532  }
533 
534 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
535 // {
536 // pbroadcast2(b, b0, b1);
537 // }
538 
539  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
540  {
541  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
542  }
543 
544  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
545  {
546 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
547  EIGEN_UNUSED_VARIABLE(tmp);
548  c.v = pmadd(a.v,b,c.v);
549 #else
550  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
551 #endif
552  }
553 
554  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
555  {
556  c += a * b;
557  }
558 
559  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
560  {
561  r = cj.pmadd(c,alpha,r);
562  }
563 
564 protected:
565  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
566 };
567 
568 template<typename Packet>
569 struct DoublePacket
570 {
571  Packet first;
572  Packet second;
573 };
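// Note: in the complex*complex traits below, a DoublePacket either holds the real part of a
// rhs coefficient broadcast in 'first' and its imaginary part in 'second' (see loadRhs()), or,
// when used as an accumulator, the two partial products (a*b_r and a*b_i) that acc() later
// recombines into the final complex result.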
574 
575 template<typename Packet>
576 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
577 {
578  DoublePacket<Packet> res;
579  res.first = padd(a.first, b.first);
580  res.second = padd(a.second,b.second);
581  return res;
582 }
583 
584 template<typename Packet>
585 const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
586 {
587  return a;
588 }
589 
590 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
591 // template<typename Packet>
592 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
593 // {
594 // DoublePacket<Packet> res;
595 // res.first = padd(a.first, b.first);
596 // res.second = padd(a.second,b.second);
597 // return res;
598 // }
599 
600 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
601 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
602 {
603 public:
604  typedef std::complex<RealScalar> Scalar;
605  typedef std::complex<RealScalar> LhsScalar;
606  typedef std::complex<RealScalar> RhsScalar;
607  typedef std::complex<RealScalar> ResScalar;
608 
609  enum {
610  ConjLhs = _ConjLhs,
611  ConjRhs = _ConjRhs,
612  Vectorizable = packet_traits<RealScalar>::Vectorizable
613  && packet_traits<Scalar>::Vectorizable,
614  RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
615  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
616  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
617  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
618 
619  // FIXME: should depend on NumberOfRegisters
620  nr = 4,
621  mr = ResPacketSize,
622 
623  LhsProgress = ResPacketSize,
624  RhsProgress = 1
625  };
626 
627  typedef typename packet_traits<RealScalar>::type RealPacket;
628  typedef typename packet_traits<Scalar>::type ScalarPacket;
629  typedef DoublePacket<RealPacket> DoublePacketType;
630 
631  typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
632  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
633  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
634  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
635 
636  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
637 
638  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
639  {
640  p.first = pset1<RealPacket>(RealScalar(0));
641  p.second = pset1<RealPacket>(RealScalar(0));
642  }
643 
644  // Scalar path
645  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
646  {
647  dest = pset1<ResPacket>(*b);
648  }
649 
650  // Vectorized path
651  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
652  {
653  dest.first = pset1<RealPacket>(real(*b));
654  dest.second = pset1<RealPacket>(imag(*b));
655  }
656 
657  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
658  {
659  loadRhs(b,dest);
660  }
661  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
662  {
663  eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
664  loadRhs(b,dest);
665  }
666 
667  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
668  {
669  // FIXME not sure that's the best way to implement it!
670  loadRhs(b+0, b0);
671  loadRhs(b+1, b1);
672  loadRhs(b+2, b2);
673  loadRhs(b+3, b3);
674  }
675 
676  // Vectorized path
677  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
678  {
679  // FIXME not sure that's the best way to implement it!
680  loadRhs(b+0, b0);
681  loadRhs(b+1, b1);
682  }
683 
684  // Scalar path
685  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
686  {
687  // FIXME not sure that's the best way to implement it!
688  loadRhs(b+0, b0);
689  loadRhs(b+1, b1);
690  }
691 
692  // nothing special here
693  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
694  {
695  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
696  }
697 
698  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
699  {
700  dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
701  }
702 
703  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
704  {
705  c.first = padd(pmul(a,b.first), c.first);
706  c.second = padd(pmul(a,b.second),c.second);
707  }
708 
709  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
710  {
711  c = cj.pmadd(a,b,c);
712  }
713 
714  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
715 
716  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
717  {
718  // assemble c
719  ResPacket tmp;
720  if((!ConjLhs)&&(!ConjRhs))
721  {
722  tmp = pcplxflip(pconj(ResPacket(c.second)));
723  tmp = padd(ResPacket(c.first),tmp);
724  }
725  else if((!ConjLhs)&&(ConjRhs))
726  {
727  tmp = pconj(pcplxflip(ResPacket(c.second)));
728  tmp = padd(ResPacket(c.first),tmp);
729  }
730  else if((ConjLhs)&&(!ConjRhs))
731  {
732  tmp = pcplxflip(ResPacket(c.second));
733  tmp = padd(pconj(ResPacket(c.first)),tmp);
734  }
735  else if((ConjLhs)&&(ConjRhs))
736  {
737  tmp = pcplxflip(ResPacket(c.second));
738  tmp = psub(pconj(ResPacket(c.first)),tmp);
739  }
740 
741  r = pmadd(tmp,alpha,r);
742  }
743 
744 protected:
745  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
746 };
747 
748 template<typename RealScalar, bool _ConjRhs>
749 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
750 {
751 public:
752  typedef std::complex<RealScalar> Scalar;
753  typedef RealScalar LhsScalar;
754  typedef Scalar RhsScalar;
755  typedef Scalar ResScalar;
756 
757  enum {
758  ConjLhs = false,
759  ConjRhs = _ConjRhs,
760  Vectorizable = packet_traits<RealScalar>::Vectorizable
761  && packet_traits<Scalar>::Vectorizable,
762  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
763  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
764  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
765 
766  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
767  // FIXME: should depend on NumberOfRegisters
768  nr = 4,
769  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
770 
771  LhsProgress = ResPacketSize,
772  RhsProgress = 1
773  };
774 
775  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
776  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
777  typedef typename packet_traits<ResScalar>::type _ResPacket;
778 
779  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
780  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
781  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
782 
783  typedef ResPacket AccPacket;
784 
785  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
786  {
787  p = pset1<ResPacket>(ResScalar(0));
788  }
789 
790  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
791  {
792  dest = pset1<RhsPacket>(*b);
793  }
794 
795  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
796  {
797  pbroadcast4(b, b0, b1, b2, b3);
798  }
799 
800 // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
801 // {
802 // // FIXME not sure that's the best way to implement it!
803 // b0 = pload1<RhsPacket>(b+0);
804 // b1 = pload1<RhsPacket>(b+1);
805 // }
806 
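  // Note: the lhs is real while the rhs/result packets are complex; ploaddup duplicates each
  // real coefficient, e.g. (a0,a1) -> (a0,a0,a1,a1), so that one lhs value multiplies both the
  // real and imaginary parts of the interleaved complex rhs (the "real*cplx" case described above).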
807  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
808  {
809  dest = ploaddup<LhsPacket>(a);
810  }
811 
812  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
813  {
814  eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
815  loadRhs(b,dest);
816  }
817 
818  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
819  {
820  dest = ploaddup<LhsPacket>(a);
821  }
822 
823  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
824  {
825  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
826  }
827 
828  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
829  {
830 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
831  EIGEN_UNUSED_VARIABLE(tmp);
832  c.v = pmadd(a,b.v,c.v);
833 #else
834  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
835 #endif
836 
837  }
838 
839  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
840  {
841  c += a * b;
842  }
843 
844  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
845  {
846  r = cj.pmadd(alpha,c,r);
847  }
848 
849 protected:
850  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
851 };
852 
853 /* optimized GEneral packed Block * packed Panel product kernel
854  *
855  * Mixing type logic: C += A * B
856  * | A | B | comments
857  * |real |cplx | no vectorization yet, would require packing A with duplication
858  * |cplx |real | easy vectorization
859  */
860 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
861 struct gebp_kernel
862 {
863  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
864  typedef typename Traits::ResScalar ResScalar;
865  typedef typename Traits::LhsPacket LhsPacket;
866  typedef typename Traits::RhsPacket RhsPacket;
867  typedef typename Traits::ResPacket ResPacket;
868  typedef typename Traits::AccPacket AccPacket;
869 
870  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
871  typedef typename SwappedTraits::ResScalar SResScalar;
872  typedef typename SwappedTraits::LhsPacket SLhsPacket;
873  typedef typename SwappedTraits::RhsPacket SRhsPacket;
874  typedef typename SwappedTraits::ResPacket SResPacket;
875  typedef typename SwappedTraits::AccPacket SAccPacket;
876 
877  typedef typename DataMapper::LinearMapper LinearMapper;
878 
879  enum {
880  Vectorizable = Traits::Vectorizable,
881  LhsProgress = Traits::LhsProgress,
882  RhsProgress = Traits::RhsProgress,
883  ResPacketSize = Traits::ResPacketSize
884  };
885 
886  EIGEN_DONT_INLINE
887  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
888  Index rows, Index depth, Index cols, ResScalar alpha,
889  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
890 };
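// Calling convention (informational, matching the signature above): 'res' is a DataMapper over
// the destination block, 'blockA'/'blockB' point to lhs/rhs panels already packed by the packing
// kernels, 'rows' x 'depth' x 'cols' are the dimensions of those packed operands, 'alpha' scales
// the product accumulated into 'res', strideA/strideB default to 'depth' (see below), and
// offsetA/offsetB shift the starting position within each packed panel.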
891 
892 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
893 EIGEN_DONT_INLINE
894 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
895  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
896  Index rows, Index depth, Index cols, ResScalar alpha,
897  Index strideA, Index strideB, Index offsetA, Index offsetB)
898  {
899  Traits traits;
900  SwappedTraits straits;
901 
902  if(strideA==-1) strideA = depth;
903  if(strideB==-1) strideB = depth;
904  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
905  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
906  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
907  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
908  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
909  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
910  const Index peeled_kc = depth & ~(pk-1);
911  const Index prefetch_res_offset = 32/sizeof(ResScalar);
912 // const Index depth2 = depth & ~1;
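  // Worked example (a sketch, assuming AVX+FMA single precision: LhsProgress==8, mr==24, nr==4):
  // with rows==100 and depth==35 we get peeled_mc3 = (100/24)*24 = 96, peeled_mc2 = 96,
  // peeled_mc1 = (100/8)*8 = 96 and peeled_kc = 35 & ~7 = 32, so rows 96..99 fall through to the
  // scalar tail below and the last 3 values of k are handled by the "remaining peeled" loops.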
913 
914  //---------- Process 3 * LhsProgress rows at once ----------
915  // This corresponds to 3*LhsProgress x nr register blocks.
916  // Usually this makes sense only with FMA.
917  if(mr>=3*Traits::LhsProgress)
918  {
919  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
920  // and on each largest micro vertical panel of the rhs (depth * nr).
921  // The blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
922  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
923  // This actual number of rows is computed as follows:
924  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
925  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
926  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
927  // or because we are testing specific blocking sizes.
928  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
929  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
930  {
931  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
932  for(Index j2=0; j2<packet_cols4; j2+=nr)
933  {
934  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
935  {
936 
937  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
938  // stored into 3 x nr registers.
939 
940  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
941  prefetch(&blA[0]);
942 
943  // gets res block as register
944  AccPacket C0, C1, C2, C3,
945  C4, C5, C6, C7,
946  C8, C9, C10, C11;
947  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
948  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
949  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
950 
951  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
952  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
953  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
954  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
955 
956  r0.prefetch(0);
957  r1.prefetch(0);
958  r2.prefetch(0);
959  r3.prefetch(0);
960 
961  // performs "inner" products
962  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
963  prefetch(&blB[0]);
964  LhsPacket A0, A1;
965 
966  for(Index k=0; k<peeled_kc; k+=pk)
967  {
968  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
969  RhsPacket B_0, T0;
970  LhsPacket A2;
971 
972 #define EIGEN_GEBP_ONESTEP(K) \
973  do { \
974  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
975  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
976  internal::prefetch(blA+(3*K+16)*LhsProgress); \
977  if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
978  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
979  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
980  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
981  traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
982  traits.madd(A0, B_0, C0, T0); \
983  traits.madd(A1, B_0, C4, T0); \
984  traits.madd(A2, B_0, C8, B_0); \
985  traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
986  traits.madd(A0, B_0, C1, T0); \
987  traits.madd(A1, B_0, C5, T0); \
988  traits.madd(A2, B_0, C9, B_0); \
989  traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
990  traits.madd(A0, B_0, C2, T0); \
991  traits.madd(A1, B_0, C6, T0); \
992  traits.madd(A2, B_0, C10, B_0); \
993  traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
994  traits.madd(A0, B_0, C3 , T0); \
995  traits.madd(A1, B_0, C7, T0); \
996  traits.madd(A2, B_0, C11, B_0); \
997  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
998  } while(false)
999 
1000  internal::prefetch(blB);
1001  EIGEN_GEBP_ONESTEP(0);
1002  EIGEN_GEBP_ONESTEP(1);
1003  EIGEN_GEBP_ONESTEP(2);
1004  EIGEN_GEBP_ONESTEP(3);
1005  EIGEN_GEBP_ONESTEP(4);
1006  EIGEN_GEBP_ONESTEP(5);
1007  EIGEN_GEBP_ONESTEP(6);
1008  EIGEN_GEBP_ONESTEP(7);
1009 
1010  blB += pk*4*RhsProgress;
1011  blA += pk*3*Traits::LhsProgress;
1012 
1013  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1014  }
1015  // process remaining peeled loop
1016  for(Index k=peeled_kc; k<depth; k++)
1017  {
1018  RhsPacket B_0, T0;
1019  LhsPacket A2;
1020  EIGEN_GEBP_ONESTEP(0);
1021  blB += 4*RhsProgress;
1022  blA += 3*Traits::LhsProgress;
1023  }
1024 
1025 #undef EIGEN_GEBP_ONESTEP
1026 
1027  ResPacket R0, R1, R2;
1028  ResPacket alphav = pset1<ResPacket>(alpha);
1029 
1030  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1031  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1032  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1033  traits.acc(C0, alphav, R0);
1034  traits.acc(C4, alphav, R1);
1035  traits.acc(C8, alphav, R2);
1036  r0.storePacket(0 * Traits::ResPacketSize, R0);
1037  r0.storePacket(1 * Traits::ResPacketSize, R1);
1038  r0.storePacket(2 * Traits::ResPacketSize, R2);
1039 
1040  R0 = r1.loadPacket(0 * Traits::ResPacketSize);
1041  R1 = r1.loadPacket(1 * Traits::ResPacketSize);
1042  R2 = r1.loadPacket(2 * Traits::ResPacketSize);
1043  traits.acc(C1, alphav, R0);
1044  traits.acc(C5, alphav, R1);
1045  traits.acc(C9, alphav, R2);
1046  r1.storePacket(0 * Traits::ResPacketSize, R0);
1047  r1.storePacket(1 * Traits::ResPacketSize, R1);
1048  r1.storePacket(2 * Traits::ResPacketSize, R2);
1049 
1050  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1051  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1052  R2 = r2.loadPacket(2 * Traits::ResPacketSize);
1053  traits.acc(C2, alphav, R0);
1054  traits.acc(C6, alphav, R1);
1055  traits.acc(C10, alphav, R2);
1056  r2.storePacket(0 * Traits::ResPacketSize, R0);
1057  r2.storePacket(1 * Traits::ResPacketSize, R1);
1058  r2.storePacket(2 * Traits::ResPacketSize, R2);
1059 
1060  R0 = r3.loadPacket(0 * Traits::ResPacketSize);
1061  R1 = r3.loadPacket(1 * Traits::ResPacketSize);
1062  R2 = r3.loadPacket(2 * Traits::ResPacketSize);
1063  traits.acc(C3, alphav, R0);
1064  traits.acc(C7, alphav, R1);
1065  traits.acc(C11, alphav, R2);
1066  r3.storePacket(0 * Traits::ResPacketSize, R0);
1067  r3.storePacket(1 * Traits::ResPacketSize, R1);
1068  r3.storePacket(2 * Traits::ResPacketSize, R2);
1069  }
1070  }
1071 
1072  // Deal with remaining columns of the rhs
1073  for(Index j2=packet_cols4; j2<cols; j2++)
1074  {
1075  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1076  {
1077  // One column at a time
1078  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1079  prefetch(&blA[0]);
1080 
1081  // gets res block as register
1082  AccPacket C0, C4, C8;
1083  traits.initAcc(C0);
1084  traits.initAcc(C4);
1085  traits.initAcc(C8);
1086 
1087  LinearMapper r0 = res.getLinearMapper(i, j2);
1088  r0.prefetch(0);
1089 
1090  // performs "inner" products
1091  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1092  LhsPacket A0, A1, A2;
1093 
1094  for(Index k=0; k<peeled_kc; k+=pk)
1095  {
1096  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1097  RhsPacket B_0;
1098 #define EIGEN_GEBGP_ONESTEP(K) \
1099  do { \
1100  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1101  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1102  traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
1103  traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
1104  traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
1105  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1106  traits.madd(A0, B_0, C0, B_0); \
1107  traits.madd(A1, B_0, C4, B_0); \
1108  traits.madd(A2, B_0, C8, B_0); \
1109  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1110  } while(false)
1111 
1112  EIGEN_GEBGP_ONESTEP(0);
1113  EIGEN_GEBGP_ONESTEP(1);
1114  EIGEN_GEBGP_ONESTEP(2);
1115  EIGEN_GEBGP_ONESTEP(3);
1116  EIGEN_GEBGP_ONESTEP(4);
1117  EIGEN_GEBGP_ONESTEP(5);
1118  EIGEN_GEBGP_ONESTEP(6);
1119  EIGEN_GEBGP_ONESTEP(7);
1120 
1121  blB += pk*RhsProgress;
1122  blA += pk*3*Traits::LhsProgress;
1123 
1124  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1125  }
1126 
1127  // process remaining peeled loop
1128  for(Index k=peeled_kc; k<depth; k++)
1129  {
1130  RhsPacket B_0;
1131  EIGEN_GEBGP_ONESTEP(0);
1132  blB += RhsProgress;
1133  blA += 3*Traits::LhsProgress;
1134  }
1135 #undef EIGEN_GEBGP_ONESTEP
1136  ResPacket R0, R1, R2;
1137  ResPacket alphav = pset1<ResPacket>(alpha);
1138 
1139  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1140  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1141  R2 = r0.loadPacket(2 * Traits::ResPacketSize);
1142  traits.acc(C0, alphav, R0);
1143  traits.acc(C4, alphav, R1);
1144  traits.acc(C8, alphav, R2);
1145  r0.storePacket(0 * Traits::ResPacketSize, R0);
1146  r0.storePacket(1 * Traits::ResPacketSize, R1);
1147  r0.storePacket(2 * Traits::ResPacketSize, R2);
1148  }
1149  }
1150  }
1151  }
1152 
1153  //---------- Process 2 * LhsProgress rows at once ----------
1154  if(mr>=2*Traits::LhsProgress)
1155  {
1156  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1157  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1158  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1159  // or because we are testing specific blocking sizes.
1160  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1161 
1162  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1163  {
1164  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1165  for(Index j2=0; j2<packet_cols4; j2+=nr)
1166  {
1167  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1168  {
1169 
1170  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1171  // stored into 2 x nr registers.
1172 
1173  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1174  prefetch(&blA[0]);
1175 
1176  // gets res block as register
1177  AccPacket C0, C1, C2, C3,
1178  C4, C5, C6, C7;
1179  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1180  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1181 
1182  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1183  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1184  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1185  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1186 
1187  r0.prefetch(prefetch_res_offset);
1188  r1.prefetch(prefetch_res_offset);
1189  r2.prefetch(prefetch_res_offset);
1190  r3.prefetch(prefetch_res_offset);
1191 
1192  // performs "inner" products
1193  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1194  prefetch(&blB[0]);
1195  LhsPacket A0, A1;
1196 
1197  for(Index k=0; k<peeled_kc; k+=pk)
1198  {
1199  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1200  RhsPacket B_0, B1, B2, B3, T0;
1201 
1202  #define EIGEN_GEBGP_ONESTEP(K) \
1203  do { \
1204  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1205  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1206  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1207  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1208  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1209  traits.madd(A0, B_0, C0, T0); \
1210  traits.madd(A1, B_0, C4, B_0); \
1211  traits.madd(A0, B1, C1, T0); \
1212  traits.madd(A1, B1, C5, B1); \
1213  traits.madd(A0, B2, C2, T0); \
1214  traits.madd(A1, B2, C6, B2); \
1215  traits.madd(A0, B3, C3, T0); \
1216  traits.madd(A1, B3, C7, B3); \
1217  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1218  } while(false)
1219 
1220  internal::prefetch(blB+(48+0));
1221  EIGEN_GEBGP_ONESTEP(0);
1222  EIGEN_GEBGP_ONESTEP(1);
1223  EIGEN_GEBGP_ONESTEP(2);
1224  EIGEN_GEBGP_ONESTEP(3);
1225  internal::prefetch(blB+(48+16));
1226  EIGEN_GEBGP_ONESTEP(4);
1227  EIGEN_GEBGP_ONESTEP(5);
1228  EIGEN_GEBGP_ONESTEP(6);
1229  EIGEN_GEBGP_ONESTEP(7);
1230 
1231  blB += pk*4*RhsProgress;
1232  blA += pk*(2*Traits::LhsProgress);
1233 
1234  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1235  }
1236  // process remaining peeled loop
1237  for(Index k=peeled_kc; k<depth; k++)
1238  {
1239  RhsPacket B_0, B1, B2, B3, T0;
1240  EIGEN_GEBGP_ONESTEP(0);
1241  blB += 4*RhsProgress;
1242  blA += 2*Traits::LhsProgress;
1243  }
1244 #undef EIGEN_GEBGP_ONESTEP
1245 
1246  ResPacket R0, R1, R2, R3;
1247  ResPacket alphav = pset1<ResPacket>(alpha);
1248 
1249  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1250  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1251  R2 = r1.loadPacket(0 * Traits::ResPacketSize);
1252  R3 = r1.loadPacket(1 * Traits::ResPacketSize);
1253  traits.acc(C0, alphav, R0);
1254  traits.acc(C4, alphav, R1);
1255  traits.acc(C1, alphav, R2);
1256  traits.acc(C5, alphav, R3);
1257  r0.storePacket(0 * Traits::ResPacketSize, R0);
1258  r0.storePacket(1 * Traits::ResPacketSize, R1);
1259  r1.storePacket(0 * Traits::ResPacketSize, R2);
1260  r1.storePacket(1 * Traits::ResPacketSize, R3);
1261 
1262  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1263  R1 = r2.loadPacket(1 * Traits::ResPacketSize);
1264  R2 = r3.loadPacket(0 * Traits::ResPacketSize);
1265  R3 = r3.loadPacket(1 * Traits::ResPacketSize);
1266  traits.acc(C2, alphav, R0);
1267  traits.acc(C6, alphav, R1);
1268  traits.acc(C3, alphav, R2);
1269  traits.acc(C7, alphav, R3);
1270  r2.storePacket(0 * Traits::ResPacketSize, R0);
1271  r2.storePacket(1 * Traits::ResPacketSize, R1);
1272  r3.storePacket(0 * Traits::ResPacketSize, R2);
1273  r3.storePacket(1 * Traits::ResPacketSize, R3);
1274  }
1275  }
1276 
1277  // Deal with remaining columns of the rhs
1278  for(Index j2=packet_cols4; j2<cols; j2++)
1279  {
1280  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1281  {
1282  // One column at a time
1283  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1284  prefetch(&blA[0]);
1285 
1286  // gets res block as register
1287  AccPacket C0, C4;
1288  traits.initAcc(C0);
1289  traits.initAcc(C4);
1290 
1291  LinearMapper r0 = res.getLinearMapper(i, j2);
1292  r0.prefetch(prefetch_res_offset);
1293 
1294  // performs "inner" products
1295  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1296  LhsPacket A0, A1;
1297 
1298  for(Index k=0; k<peeled_kc; k+=pk)
1299  {
1300  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
1301  RhsPacket B_0, B1;
1302 
1303 #define EIGEN_GEBGP_ONESTEP(K) \
1304  do { \
1305  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1306  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1307  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1308  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1309  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1310  traits.madd(A0, B_0, C0, B1); \
1311  traits.madd(A1, B_0, C4, B_0); \
1312  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1313  } while(false)
1314 
1315  EIGEN_GEBGP_ONESTEP(0);
1316  EIGEN_GEBGP_ONESTEP(1);
1317  EIGEN_GEBGP_ONESTEP(2);
1318  EIGEN_GEBGP_ONESTEP(3);
1319  EIGEN_GEBGP_ONESTEP(4);
1320  EIGEN_GEBGP_ONESTEP(5);
1321  EIGEN_GEBGP_ONESTEP(6);
1322  EIGEN_GEBGP_ONESTEP(7);
1323 
1324  blB += pk*RhsProgress;
1325  blA += pk*2*Traits::LhsProgress;
1326 
1327  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
1328  }
1329 
1330  // process remaining peeled loop
1331  for(Index k=peeled_kc; k<depth; k++)
1332  {
1333  RhsPacket B_0, B1;
1334  EIGEN_GEBGP_ONESTEP(0);
1335  blB += RhsProgress;
1336  blA += 2*Traits::LhsProgress;
1337  }
1338 #undef EIGEN_GEBGP_ONESTEP
1339  ResPacket R0, R1;
1340  ResPacket alphav = pset1<ResPacket>(alpha);
1341 
1342  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1343  R1 = r0.loadPacket(1 * Traits::ResPacketSize);
1344  traits.acc(C0, alphav, R0);
1345  traits.acc(C4, alphav, R1);
1346  r0.storePacket(0 * Traits::ResPacketSize, R0);
1347  r0.storePacket(1 * Traits::ResPacketSize, R1);
1348  }
1349  }
1350  }
1351  }
1352  //---------- Process 1 * LhsProgress rows at once ----------
1353  if(mr>=1*Traits::LhsProgress)
1354  {
1355  // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
1356  for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
1357  {
1358  // loops on each largest micro vertical panel of rhs (depth * nr)
1359  for(Index j2=0; j2<packet_cols4; j2+=nr)
1360  {
1361  // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
1362  // stored into 1 x nr registers.
1363 
1364  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1365  prefetch(&blA[0]);
1366 
1367  // gets res block as register
1368  AccPacket C0, C1, C2, C3;
1369  traits.initAcc(C0);
1370  traits.initAcc(C1);
1371  traits.initAcc(C2);
1372  traits.initAcc(C3);
1373 
1374  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1375  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1376  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1377  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1378 
1379  r0.prefetch(prefetch_res_offset);
1380  r1.prefetch(prefetch_res_offset);
1381  r2.prefetch(prefetch_res_offset);
1382  r3.prefetch(prefetch_res_offset);
1383 
1384  // performs "inner" products
1385  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1386  prefetch(&blB[0]);
1387  LhsPacket A0;
1388 
1389  for(Index k=0; k<peeled_kc; k+=pk)
1390  {
1391  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
1392  RhsPacket B_0, B1, B2, B3;
1393 
1394 #define EIGEN_GEBGP_ONESTEP(K) \
1395  do { \
1396  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
1397  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1398  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1399  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
1400  traits.madd(A0, B_0, C0, B_0); \
1401  traits.madd(A0, B1, C1, B1); \
1402  traits.madd(A0, B2, C2, B2); \
1403  traits.madd(A0, B3, C3, B3); \
1404  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
1405  } while(false)
1406 
1407  internal::prefetch(blB+(48+0));
1408  EIGEN_GEBGP_ONESTEP(0);
1409  EIGEN_GEBGP_ONESTEP(1);
1410  EIGEN_GEBGP_ONESTEP(2);
1411  EIGEN_GEBGP_ONESTEP(3);
1412  internal::prefetch(blB+(48+16));
1413  EIGEN_GEBGP_ONESTEP(4);
1414  EIGEN_GEBGP_ONESTEP(5);
1415  EIGEN_GEBGP_ONESTEP(6);
1416  EIGEN_GEBGP_ONESTEP(7);
1417 
1418  blB += pk*4*RhsProgress;
1419  blA += pk*1*LhsProgress;
1420 
1421  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
1422  }
1423  // process remaining peeled loop
1424  for(Index k=peeled_kc; k<depth; k++)
1425  {
1426  RhsPacket B_0, B1, B2, B3;
1427  EIGEN_GEBGP_ONESTEP(0);
1428  blB += 4*RhsProgress;
1429  blA += 1*LhsProgress;
1430  }
1431 #undef EIGEN_GEBGP_ONESTEP
1432 
1433  ResPacket R0, R1;
1434  ResPacket alphav = pset1<ResPacket>(alpha);
1435 
1436  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1437  R1 = r1.loadPacket(0 * Traits::ResPacketSize);
1438  traits.acc(C0, alphav, R0);
1439  traits.acc(C1, alphav, R1);
1440  r0.storePacket(0 * Traits::ResPacketSize, R0);
1441  r1.storePacket(0 * Traits::ResPacketSize, R1);
1442 
1443  R0 = r2.loadPacket(0 * Traits::ResPacketSize);
1444  R1 = r3.loadPacket(0 * Traits::ResPacketSize);
1445  traits.acc(C2, alphav, R0);
1446  traits.acc(C3, alphav, R1);
1447  r2.storePacket(0 * Traits::ResPacketSize, R0);
1448  r3.storePacket(0 * Traits::ResPacketSize, R1);
1449  }
1450 
1451  // Deal with remaining columns of the rhs
1452  for(Index j2=packet_cols4; j2<cols; j2++)
1453  {
1454  // One column at a time
1455  const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
1456  prefetch(&blA[0]);
1457 
1458  // gets res block as register
1459  AccPacket C0;
1460  traits.initAcc(C0);
1461 
1462  LinearMapper r0 = res.getLinearMapper(i, j2);
1463 
1464  // performs "inner" products
1465  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1466  LhsPacket A0;
1467 
1468  for(Index k=0; k<peeled_kc; k+=pk)
1469  {
1470  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
1471  RhsPacket B_0;
1472 
1473 #define EIGEN_GEBGP_ONESTEP(K) \
1474  do { \
1475  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
1476  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1477  traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
1478  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1479  traits.madd(A0, B_0, C0, B_0); \
1480  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
1481  } while(false);
1482 
1483  EIGEN_GEBGP_ONESTEP(0);
1484  EIGEN_GEBGP_ONESTEP(1);
1485  EIGEN_GEBGP_ONESTEP(2);
1486  EIGEN_GEBGP_ONESTEP(3);
1487  EIGEN_GEBGP_ONESTEP(4);
1488  EIGEN_GEBGP_ONESTEP(5);
1489  EIGEN_GEBGP_ONESTEP(6);
1490  EIGEN_GEBGP_ONESTEP(7);
1491 
1492  blB += pk*RhsProgress;
1493  blA += pk*1*Traits::LhsProgress;
1494 
1495  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
1496  }
1497 
1498  // process remaining peeled loop
1499  for(Index k=peeled_kc; k<depth; k++)
1500  {
1501  RhsPacket B_0;
1502  EIGEN_GEBGP_ONESTEP(0);
1503  blB += RhsProgress;
1504  blA += 1*Traits::LhsProgress;
1505  }
1506 #undef EIGEN_GEBGP_ONESTEP
1507  ResPacket R0;
1508  ResPacket alphav = pset1<ResPacket>(alpha);
1509  R0 = r0.loadPacket(0 * Traits::ResPacketSize);
1510  traits.acc(C0, alphav, R0);
1511  r0.storePacket(0 * Traits::ResPacketSize, R0);
1512  }
1513  }
1514  }
1515  //---------- Process remaining rows, 1 at once ----------
1516  if(peeled_mc1<rows)
1517  {
1518  // loop on each panel of the rhs
1519  for(Index j2=0; j2<packet_cols4; j2+=nr)
1520  {
1521  // loop on each row of the lhs (1*LhsProgress x depth)
1522  for(Index i=peeled_mc1; i<rows; i+=1)
1523  {
1524  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1525  prefetch(&blA[0]);
1526  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1527 
1528  // The following piece of code won't work for 512-bit registers.
1529  // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
1530  // as nr (which is currently 4) for the return type.
1531  typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
1532  if ((SwappedTraits::LhsProgress % 4) == 0 &&
1533  (SwappedTraits::LhsProgress <= 8) &&
1534  (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
1535  {
1536  SAccPacket C0, C1, C2, C3;
1537  straits.initAcc(C0);
1538  straits.initAcc(C1);
1539  straits.initAcc(C2);
1540  straits.initAcc(C3);
1541 
1542  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1543  const Index endk = (depth/spk)*spk;
1544  const Index endk4 = (depth/(spk*4))*(spk*4);
1545 
1546  Index k=0;
1547  for(; k<endk4; k+=4*spk)
1548  {
1549  SLhsPacket A0,A1;
1550  SRhsPacket B_0,B_1;
1551 
1552  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1553  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1554 
1555  straits.loadRhsQuad(blA+0*spk, B_0);
1556  straits.loadRhsQuad(blA+1*spk, B_1);
1557  straits.madd(A0,B_0,C0,B_0);
1558  straits.madd(A1,B_1,C1,B_1);
1559 
1560  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1561  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1562  straits.loadRhsQuad(blA+2*spk, B_0);
1563  straits.loadRhsQuad(blA+3*spk, B_1);
1564  straits.madd(A0,B_0,C2,B_0);
1565  straits.madd(A1,B_1,C3,B_1);
1566 
1567  blB += 4*SwappedTraits::LhsProgress;
1568  blA += 4*spk;
1569  }
1570  C0 = padd(padd(C0,C1),padd(C2,C3));
1571  for(; k<endk; k+=spk)
1572  {
1573  SLhsPacket A0;
1574  SRhsPacket B_0;
1575 
1576  straits.loadLhsUnaligned(blB, A0);
1577  straits.loadRhsQuad(blA, B_0);
1578  straits.madd(A0,B_0,C0,B_0);
1579 
1580  blB += SwappedTraits::LhsProgress;
1581  blA += spk;
1582  }
1583  if(SwappedTraits::LhsProgress==8)
1584  {
1585  // Special case where we have to first reduce the accumulation register C0
1586  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1587  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1588  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1589  typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1590 
1591  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1592  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1593 
1594  if(depth-endk>0)
1595  {
1596  // We have to handle the last row of the rhs which corresponds to a half-packet
1597  SLhsPacketHalf a0;
1598  SRhsPacketHalf b0;
1599  straits.loadLhsUnaligned(blB, a0);
1600  straits.loadRhs(blA, b0);
1601  SAccPacketHalf c0 = predux4(C0);
1602  straits.madd(a0,b0,c0,b0);
1603  straits.acc(c0, alphav, R);
1604  }
1605  else
1606  {
1607  straits.acc(predux4(C0), alphav, R);
1608  }
1609  res.scatterPacket(i, j2, R);
1610  }
1611  else
1612  {
1613  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
1614  SResPacket alphav = pset1<SResPacket>(alpha);
1615  straits.acc(C0, alphav, R);
1616  res.scatterPacket(i, j2, R);
1617  }
1618  }
1619  else // scalar path
1620  {
1621  // get a 1 x 4 res block as registers
1622  ResScalar C0(0), C1(0), C2(0), C3(0);
1623 
1624  for(Index k=0; k<depth; k++)
1625  {
1626  LhsScalar A0;
1627  RhsScalar B_0, B_1;
1628 
1629  A0 = blA[k];
1630 
1631  B_0 = blB[0];
1632  B_1 = blB[1];
1633  CJMADD(cj,A0,B_0,C0, B_0);
1634  CJMADD(cj,A0,B_1,C1, B_1);
1635 
1636  B_0 = blB[2];
1637  B_1 = blB[3];
1638  CJMADD(cj,A0,B_0,C2, B_0);
1639  CJMADD(cj,A0,B_1,C3, B_1);
1640 
1641  blB += 4;
1642  }
1643  res(i, j2 + 0) += alpha * C0;
1644  res(i, j2 + 1) += alpha * C1;
1645  res(i, j2 + 2) += alpha * C2;
1646  res(i, j2 + 3) += alpha * C3;
1647  }
1648  }
1649  }
1650  // remaining columns
1651  for(Index j2=packet_cols4; j2<cols; j2++)
1652  {
1653  // loop on each row of the lhs (1*LhsProgress x depth)
1654  for(Index i=peeled_mc1; i<rows; i+=1)
1655  {
1656  const LhsScalar* blA = &blockA[i*strideA+offsetA];
1657  prefetch(&blA[0]);
1658  // get a 1 x 1 res block as a register
1659  ResScalar C0(0);
1660  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1661  for(Index k=0; k<depth; k++)
1662  {
1663  LhsScalar A0 = blA[k];
1664  RhsScalar B_0 = blB[k];
1665  CJMADD(cj, A0, B_0, C0, B_0);
1666  }
1667  res(i, j2) += alpha * C0;
1668  }
1669  }
1670  }
1671  }
1672 
1673 
1674 #undef CJMADD
1675 
1676 // pack a block of the lhs
1677 // The traversal is as follows (mr==4):
1678 // 0 4 8 12 ...
1679 // 1 5 9 13 ...
1680 // 2 6 10 14 ...
1681 // 3 7 11 15 ...
1682 //
1683 // 16 20 24 28 ...
1684 // 17 21 25 29 ...
1685 // 18 22 26 30 ...
1686 // 19 23 27 31 ...
1687 //
1688 // 32 33 34 35 ...
1689 // 36 37 38 39 ...
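//
// Purely as an illustration of this layout (not part of Eigen), the position of
// lhs(i,k) inside blockA can be sketched for the simple case where the whole row
// range is covered by mr==4 panels and PanelMode is off; the helper name below
// is hypothetical:
//
// inline std::ptrdiff_t packedLhsIndexSketch(std::ptrdiff_t i, std::ptrdiff_t k,
//                                            std::ptrdiff_t depth)
// {
//   const std::ptrdiff_t mr = 4;
//   const std::ptrdiff_t panel        = i / mr;  // which mr-row panel
//   const std::ptrdiff_t row_in_panel = i % mr;  // row inside that panel
//   return panel * mr * depth   // skip the previously packed panels
//        + k * mr               // within a panel, columns are interleaved by mr
//        + row_in_panel;
// }
//
// With depth==4 this reproduces the numbering above: lhs(1,2) maps to
// 0*16 + 2*4 + 1 == 9, and the second panel (rows 4..7) starts at 16.
// The trailing rows that do not fill a full panel are stored row by row instead,
// as in the last two lines of the diagram.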
1690 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1691 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1692 {
1693  typedef typename DataMapper::LinearMapper LinearMapper;
1694  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1695 };
1696 
1697 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1698 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
1699  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1700 {
1701  typedef typename packet_traits<Scalar>::type Packet;
1702  enum { PacketSize = packet_traits<Scalar>::size };
1703 
1704  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1705  EIGEN_UNUSED_VARIABLE(stride);
1706  EIGEN_UNUSED_VARIABLE(offset);
1707  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1708  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
1709  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1710  Index count = 0;
1711 
1712  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1713  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1714  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1715  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
1716  : Pack2>1 ? (rows/Pack2)*Pack2 : 0;
1717 
1718  Index i=0;
1719 
1720  // Pack 3 packets
1721  if(Pack1>=3*PacketSize)
1722  {
1723  for(; i<peeled_mc3; i+=3*PacketSize)
1724  {
1725  if(PanelMode) count += (3*PacketSize) * offset;
1726 
1727  for(Index k=0; k<depth; k++)
1728  {
1729  Packet A, B, C;
1730  A = lhs.loadPacket(i+0*PacketSize, k);
1731  B = lhs.loadPacket(i+1*PacketSize, k);
1732  C = lhs.loadPacket(i+2*PacketSize, k);
1733  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1734  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1735  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
1736  }
1737  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
1738  }
1739  }
1740  // Pack 2 packets
1741  if(Pack1>=2*PacketSize)
1742  {
1743  for(; i<peeled_mc2; i+=2*PacketSize)
1744  {
1745  if(PanelMode) count += (2*PacketSize) * offset;
1746 
1747  for(Index k=0; k<depth; k++)
1748  {
1749  Packet A, B;
1750  A = lhs.loadPacket(i+0*PacketSize, k);
1751  B = lhs.loadPacket(i+1*PacketSize, k);
1752  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
1753  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
1754  }
1755  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
1756  }
1757  }
1758  // Pack 1 packet
1759  if(Pack1>=1*PacketSize)
1760  {
1761  for(; i<peeled_mc1; i+=1*PacketSize)
1762  {
1763  if(PanelMode) count += (1*PacketSize) * offset;
1764 
1765  for(Index k=0; k<depth; k++)
1766  {
1767  Packet A;
1768  A = lhs.loadPacket(i+0*PacketSize, k);
1769  pstore(blockA+count, cj.pconj(A));
1770  count+=PacketSize;
1771  }
1772  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
1773  }
1774  }
1775  // Pack scalars
1776  if(Pack2<PacketSize && Pack2>1)
1777  {
1778  for(; i<peeled_mc0; i+=Pack2)
1779  {
1780  if(PanelMode) count += Pack2 * offset;
1781 
1782  for(Index k=0; k<depth; k++)
1783  for(Index w=0; w<Pack2; w++)
1784  blockA[count++] = cj(lhs(i+w, k));
1785 
1786  if(PanelMode) count += Pack2 * (stride-offset-depth);
1787  }
1788  }
1789  for(; i<rows; i++)
1790  {
1791  if(PanelMode) count += offset;
1792  for(Index k=0; k<depth; k++)
1793  blockA[count++] = cj(lhs(i, k));
1794  if(PanelMode) count += (stride-offset-depth);
1795  }
1796 }
1797 
1798 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1799 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1800 {
1801  typedef typename DataMapper::LinearMapper LinearMapper;
1802  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
1803 };
1804 
1805 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
1806 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
1807  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1808 {
1809  typedef typename packet_traits<Scalar>::type Packet;
1810  enum { PacketSize = packet_traits<Scalar>::size };
1811 
1812  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
1813  EIGEN_UNUSED_VARIABLE(stride);
1814  EIGEN_UNUSED_VARIABLE(offset);
1815  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1816  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1817  Index count = 0;
1818 
1819 // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
1820 // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
1821 // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
1822 
1823  int pack = Pack1;
1824  Index i = 0;
1825  while(pack>0)
1826  {
1827  Index remaining_rows = rows-i;
1828  Index peeled_mc = i+(remaining_rows/pack)*pack;
1829  for(; i<peeled_mc; i+=pack)
1830  {
1831  if(PanelMode) count += pack * offset;
1832 
1833  const Index peeled_k = (depth/PacketSize)*PacketSize;
1834  Index k=0;
1835  if(pack>=PacketSize)
1836  {
1837  for(; k<peeled_k; k+=PacketSize)
1838  {
1839  for (Index m = 0; m < pack; m += PacketSize)
1840  {
1841  PacketBlock<Packet> kernel;
1842  for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
1843  ptranspose(kernel);
1844  for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
1845  }
1846  count += PacketSize*pack;
1847  }
1848  }
1849  for(; k<depth; k++)
1850  {
1851  Index w=0;
1852  for(; w<pack-3; w+=4)
1853  {
1854  Scalar a(cj(lhs(i+w+0, k))),
1855  b(cj(lhs(i+w+1, k))),
1856  c(cj(lhs(i+w+2, k))),
1857  d(cj(lhs(i+w+3, k)));
1858  blockA[count++] = a;
1859  blockA[count++] = b;
1860  blockA[count++] = c;
1861  blockA[count++] = d;
1862  }
1863  if(pack%4)
1864  for(;w<pack;++w)
1865  blockA[count++] = cj(lhs(i+w, k));
1866  }
1867 
1868  if(PanelMode) count += pack * (stride-offset-depth);
1869  }
1870 
1871  pack -= PacketSize;
1872  if(pack<Pack2 && (pack+PacketSize)!=Pack2)
1873  pack = Pack2;
1874  }
1875 
1876  for(; i<rows; i++)
1877  {
1878  if(PanelMode) count += offset;
1879  for(Index k=0; k<depth; k++)
1880  blockA[count++] = cj(lhs(i, k));
1881  if(PanelMode) count += (stride-offset-depth);
1882  }
1883 }
1884 
1885 // copy a complete panel of the rhs
1886 // this version is optimized for column major matrices
1887 // The traversal order is as follows (nr==4):
1888 // 0 1 2 3 12 13 14 15 24 27
1889 // 4 5 6 7 16 17 18 19 25 28
1890 // 8 9 10 11 20 21 22 23 26 29
1891 // . . . . . . . . . .
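//
// As an illustration only (the helper below is hypothetical, not Eigen API), the
// position of rhs(k,j) inside blockB for nr==4 and PanelMode off can be sketched as:
//
// inline std::ptrdiff_t packedRhsIndexSketch(std::ptrdiff_t k, std::ptrdiff_t j,
//                                            std::ptrdiff_t depth, std::ptrdiff_t cols)
// {
//   const std::ptrdiff_t nr = 4;
//   const std::ptrdiff_t packet_cols4 = (cols/nr)*nr;
//   if(j < packet_cols4)                        // columns handled four at a time
//     return (j/nr)*nr*depth + k*nr + (j%nr);   // interleaved by nr within a group
//   // leftover columns are copied one at a time, contiguously along k
//   return packet_cols4*depth + (j-packet_cols4)*depth + k;
// }
//
// With depth==3 and cols==10 this reproduces the numbering above, e.g.
// rhs(1,5) maps to 12 + 1*4 + 1 == 17 and rhs(0,9) maps to 24 + 3 + 0 == 27.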
1892 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1893 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1894 {
1895  typedef typename packet_traits<Scalar>::type Packet;
1896  typedef typename DataMapper::LinearMapper LinearMapper;
1897  enum { PacketSize = packet_traits<Scalar>::size };
1898  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
1899 };
1900 
1901 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
1902 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
1903  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1904 {
1905  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
1906  EIGEN_UNUSED_VARIABLE(stride);
1907  EIGEN_UNUSED_VARIABLE(offset);
1908  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
1909  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
1910  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1911  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1912  Index count = 0;
1913  const Index peeled_k = (depth/PacketSize)*PacketSize;
1914 // if(nr>=8)
1915 // {
1916 // for(Index j2=0; j2<packet_cols8; j2+=8)
1917 // {
1918 // // skip what we have before
1919 // if(PanelMode) count += 8 * offset;
1920 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
1921 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
1922 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
1923 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1924 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
1925 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
1926 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
1927 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
1928 // Index k=0;
1929 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
1930 // {
1931 // for(; k<peeled_k; k+=PacketSize) {
1932 // PacketBlock<Packet> kernel;
1933 // for (int p = 0; p < PacketSize; ++p) {
1934 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
1935 // }
1936 // ptranspose(kernel);
1937 // for (int p = 0; p < PacketSize; ++p) {
1938 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
1939 // count+=PacketSize;
1940 // }
1941 // }
1942 // }
1943 // for(; k<depth; k++)
1944 // {
1945 // blockB[count+0] = cj(b0[k]);
1946 // blockB[count+1] = cj(b1[k]);
1947 // blockB[count+2] = cj(b2[k]);
1948 // blockB[count+3] = cj(b3[k]);
1949 // blockB[count+4] = cj(b4[k]);
1950 // blockB[count+5] = cj(b5[k]);
1951 // blockB[count+6] = cj(b6[k]);
1952 // blockB[count+7] = cj(b7[k]);
1953 // count += 8;
1954 // }
1955 // // skip what we have after
1956 // if(PanelMode) count += 8 * (stride-offset-depth);
1957 // }
1958 // }
1959 
1960  if(nr>=4)
1961  {
1962  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1963  {
1964  // skip what we have before
1965  if(PanelMode) count += 4 * offset;
1966  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
1967  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
1968  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
1969  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
1970 
1971  Index k=0;
1972  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1973  {
1974  for(; k<peeled_k; k+=PacketSize) {
1975  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1976  kernel.packet[0] = dm0.loadPacket(k);
1977  kernel.packet[1%PacketSize] = dm1.loadPacket(k);
1978  kernel.packet[2%PacketSize] = dm2.loadPacket(k);
1979  kernel.packet[3%PacketSize] = dm3.loadPacket(k);
1980  ptranspose(kernel);
1981  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1982  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1983  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1984  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1985  count+=4*PacketSize;
1986  }
1987  }
1988  for(; k<depth; k++)
1989  {
1990  blockB[count+0] = cj(dm0(k));
1991  blockB[count+1] = cj(dm1(k));
1992  blockB[count+2] = cj(dm2(k));
1993  blockB[count+3] = cj(dm3(k));
1994  count += 4;
1995  }
1996  // skip what we have after
1997  if(PanelMode) count += 4 * (stride-offset-depth);
1998  }
1999  }
2000 
2001  // copy the remaining columns one at a time (nr==1)
2002  for(Index j2=packet_cols4; j2<cols; ++j2)
2003  {
2004  if(PanelMode) count += offset;
2005  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2006  for(Index k=0; k<depth; k++)
2007  {
2008  blockB[count] = cj(dm0(k));
2009  count += 1;
2010  }
2011  if(PanelMode) count += (stride-offset-depth);
2012  }
2013 }
2014 
2015 // this version is optimized for row major matrices
2016 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2017 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2018 {
2019  typedef typename packet_traits<Scalar>::type Packet;
2020  typedef typename DataMapper::LinearMapper LinearMapper;
2021  enum { PacketSize = packet_traits<Scalar>::size };
2022  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2023 };
2024 
2025 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2026 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2027  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2028 {
2029  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2030  EIGEN_UNUSED_VARIABLE(stride);
2031  EIGEN_UNUSED_VARIABLE(offset);
2032  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2033  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2034  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2035  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2036  Index count = 0;
2037 
2038 // if(nr>=8)
2039 // {
2040 // for(Index j2=0; j2<packet_cols8; j2+=8)
2041 // {
2042 // // skip what we have before
2043 // if(PanelMode) count += 8 * offset;
2044 // for(Index k=0; k<depth; k++)
2045 // {
2046 // if (PacketSize==8) {
2047 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2048 // pstoreu(blockB+count, cj.pconj(A));
2049 // } else if (PacketSize==4) {
2050 // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2051 // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2052 // pstoreu(blockB+count, cj.pconj(A));
2053 // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2054 // } else {
2055 // const Scalar* b0 = &rhs[k*rhsStride + j2];
2056 // blockB[count+0] = cj(b0[0]);
2057 // blockB[count+1] = cj(b0[1]);
2058 // blockB[count+2] = cj(b0[2]);
2059 // blockB[count+3] = cj(b0[3]);
2060 // blockB[count+4] = cj(b0[4]);
2061 // blockB[count+5] = cj(b0[5]);
2062 // blockB[count+6] = cj(b0[6]);
2063 // blockB[count+7] = cj(b0[7]);
2064 // }
2065 // count += 8;
2066 // }
2067 // // skip what we have after
2068 // if(PanelMode) count += 8 * (stride-offset-depth);
2069 // }
2070 // }
2071  if(nr>=4)
2072  {
2073  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2074  {
2075  // skip what we have before
2076  if(PanelMode) count += 4 * offset;
2077  for(Index k=0; k<depth; k++)
2078  {
2079  if (PacketSize==4) {
2080  Packet A = rhs.loadPacket(k, j2);
2081  pstoreu(blockB+count, cj.pconj(A));
2082  count += PacketSize;
2083  } else {
2084  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2085  blockB[count+0] = cj(dm0(0));
2086  blockB[count+1] = cj(dm0(1));
2087  blockB[count+2] = cj(dm0(2));
2088  blockB[count+3] = cj(dm0(3));
2089  count += 4;
2090  }
2091  }
2092  // skip what we have after
2093  if(PanelMode) count += 4 * (stride-offset-depth);
2094  }
2095  }
2096  // copy the remaining columns one at a time (nr==1)
2097  for(Index j2=packet_cols4; j2<cols; ++j2)
2098  {
2099  if(PanelMode) count += offset;
2100  for(Index k=0; k<depth; k++)
2101  {
2102  blockB[count] = cj(rhs(k, j2));
2103  count += 1;
2104  }
2105  if(PanelMode) count += stride-offset-depth;
2106  }
2107 }
2108 
2109 } // end namespace internal
2110 
2113 inline std::ptrdiff_t l1CacheSize()
2114 {
2115  std::ptrdiff_t l1, l2, l3;
2116  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2117  return l1;
2118 }
2119 
2122 inline std::ptrdiff_t l2CacheSize()
2123 {
2124  std::ptrdiff_t l1, l2, l3;
2125  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2126  return l2;
2127 }
2128 
2132 inline std::ptrdiff_t l3CacheSize()
2133 {
2134  std::ptrdiff_t l1, l2, l3;
2135  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2136  return l3;
2137 }
2138 
2144 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2145 {
2146  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2147 }
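 
// A minimal usage sketch of the cache-size API defined above (illustration only;
// the byte values are arbitrary examples, not recommendations):
//
// #include <Eigen/Core>
// #include <iostream>
//
// int main()
// {
//   // cache sizes Eigen currently uses for its blocking heuristics
//   std::cout << "L1: " << Eigen::l1CacheSize()
//             << "  L2: " << Eigen::l2CacheSize()
//             << "  L3: " << Eigen::l3CacheSize() << std::endl;
//
//   // override them, e.g. to assume a 64KB L1, 1MB L2 and 8MB L3 cache
//   Eigen::setCpuCacheSizes(64*1024, 1024*1024, 8*1024*1024);
//   return 0;
// }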
2148 
2149 } // end namespace Eigen
2150 
2151 #endif // EIGEN_GENERAL_BLOCK_PANEL_H