#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;
/** \internal \returns b if a<=0, and a otherwise */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}
#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = 32*1024;
const std::ptrdiff_t defaultL2CacheSize = 256*1024;
const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
#else
const std::ptrdiff_t defaultL1CacheSize = 16*1024;
const std::ptrdiff_t defaultL2CacheSize = 512*1024;
const std::ptrdiff_t defaultL3CacheSize = 512*1024;
#endif
/** \internal cache the CPU cache sizes, falling back to the defaults above */
struct CacheSizes {
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
  }

  std::ptrdiff_t m_l1;
  std::ptrdiff_t m_l2;
  std::ptrdiff_t m_l3;
};
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  static CacheSizes m_cacheSizes;

  if(action==SetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
  else
  {
    eigen_internal_assert(false);
  }
}
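
// Usage sketch (illustrative, using only functions defined in this file): the
// cached sizes are read with GetAction; the first call constructs the static
// CacheSizes instance, which queries the CPU exactly once.
//
//   std::ptrdiff_t l1, l2, l3;
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);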
/** \brief Computes the blocking parameters for a m x k times k x n matrix product,
  * reducing k, m and n in place to the block sizes kc, mc and nc. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives more time to prefetch the content of the "C" registers,
    // but once the latency is hidden there is no point in increasing it further,
    // so cap it at 320.
    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }
    if (l3 > l2) {
      // l3 is shared between all cores, so use a fraction of it per thread.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check the blocking strategy is not flawed.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    l1 = 9*1024;
    l2 = 32*1024;
    l3 = 512*1024;
#endif
    // Early return for small problems: the computation below is too costly for them.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;
    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };
    // ---- 1st level of blocking on L1, yields kc ----
    // kc is chosen so that an mr x kc horizontal panel of the lhs plus a kc x nr
    // vertical panel of the rhs fit within L1, rounded down to a multiple of the
    // packet-peeling factor.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      //    while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif
    // nc is chosen so that a kc x nc block of the rhs fits within half of L2;
    // the other half is implicitly reserved for the result and lhs coefficients.
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    Index max_nc;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // Blocking over the columns: make the last block as large as possible
      // while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
        : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    else if(old_k==k)
    {
      // So far, no blocking at all, i.e., kc==k and nc==n.
      // In this case, perform a blocking over the rows such that the packed lhs
      // data is kept in cache L1/L2.
      Index problem_size = k*n*sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
      if(problem_size<=1024)
      {
        // problem is small enough to keep in L1
        actual_lm = l1;
      }
      else if(l3!=0 && problem_size<=32768)
      {
        // we have both L2 and L3, and problem is small enough to be kept in L2
        actual_lm = l2;
        max_mc = (numext::mini<Index>)(576,max_mc);
      }
      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr) mc -= mc % Traits::mr;
      else if (mc==0) return;
      m = (m%mc)==0 ? mc
        : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
    }
  }
}
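
// Worked example of the single-threaded heuristic above (illustrative numbers,
// assuming float, Traits::mr==12, Traits::nr==4, KcFactor==1, l1==32KB):
//   k_sub  = 12*4*4 bytes = 192
//   k_div  = 12*4 + 4*4   = 64
//   max_kc = ((32768-192)/64) & ~7 = 509 & ~7 = 504
// i.e. kc is chosen so that one mr x kc lhs panel, a kc x nr rhs panel and the
// mr x nr result block together fit within the L1 cache.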
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}
/** \brief Computes the blocking parameters for a m x k times k x n matrix product.
  * Either a cache-size based heuristic or fixed prescribed sizes (for testing) are used. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}
template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
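
// Usage sketch (illustrative values): k/m/n are passed in as the full problem
// dimensions and are reduced in place to the block sizes kc/mc/nc.
//
//   Index k = 2000, m = 2000, n = 2000;
//   internal::computeProductBlockingSizes<float,float>(k, m, n);
//   // k, m, n now hold kc, mc, nc for a single-threaded float product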
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe?)
  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };
  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }
  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
#endif

/* gebp_traits describes how a given scalar pairing is vectorized: which packet
 * types to use and how to load, multiply and accumulate them. */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };
  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }
  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }
  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = pmadd(a,b,c);
#else
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
#endif
  }
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
};
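
// Illustration of the register blocking implied by these traits (numbers are
// an assumption for AVX+FMA floats, where LhsPacketSize==8): mr==3*8==24 and
// nr==4, so the gebp kernel below accumulates a 24x4 block of the result in
// twelve AccPacket registers (C0..C11 in the 3pX4 micro kernel).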
// Specialization for the (complex lhs) * (real rhs) case.
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = 3*LhsPacketSize,
#else
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };
  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }
  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
// DoublePacket holds the broadcast real and imaginary parts of a complex rhs
// coefficient in two separate real packets.
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second,b.second);
  return res;
}

template<typename Packet>
const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
{
  return a;
}

template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
// Specialization for the (complex lhs) * (complex rhs) case: the rhs is
// unpacked into a DoublePacket holding the broadcast real and imaginary parts.
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,
    LhsPacketSize  = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize  = Vectorizable ? packet_traits<RhsScalar>::size : 1,

    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };
  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  typedef DoublePacket<RealPacket> DoublePacketType;

  typedef typename conditional<Vectorizable,RealPacket,      Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,    Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }
  // Scalar path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
  {
    dest = pset1<ResPacket>(*b);
  }

  // Vectorized path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
  {
    dest.first  = pset1<RealPacket>(real(*b));
    dest.second = pset1<RealPacket>(imag(*b));
  }
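
  // Example: for *b == 1+2i and 4-wide real packets, dest.first == (1,1,1,1)
  // and dest.second == (2,2,2,2); the real and imaginary halves of the complex
  // multiply are kept in separate registers until acc() recombines them.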
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
  {
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
  {
    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    loadRhs(b+0, b0); loadRhs(b+1, b1);
    loadRhs(b+2, b2); loadRhs(b+3, b3);
  }

  // Vectorized path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
  {
    loadRhs(b+0, b0); loadRhs(b+1, b1);
  }

  // Scalar path
  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
  {
    loadRhs(b+0, b0); loadRhs(b+1, b1);
  }
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c: recombine the separately accumulated real/imaginary halves
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
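
// Why the recombination in acc() is correct (worked out for the no-conjugation
// case): with a = (ar,ai) interleaved and b broadcast as first==br, second==bi,
//   c.first  == (ar*br, ai*br, ...)   viewed as complex: ar*br + i*ai*br
//   c.second == (ar*bi, ai*bi, ...)   viewed as complex: ar*bi + i*ai*bi
// so pcplxflip(pconj(c.second)) == -ai*bi + i*ar*bi, and adding c.first yields
// (ar*br - ai*bi) + i*(ai*br + ar*bi) == a*b, as expected.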
// Specialization for the (real lhs) * (complex rhs) case.
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef RealScalar  LhsScalar;
  typedef Scalar      RhsScalar;
  typedef Scalar      ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };
  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pset1<RhsPacket>(*b);
  }

  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
  {
    pbroadcast4(b, b0, b1, b2, b3);
  }
  // The lhs is real, so each coefficient is duplicated to fill a complex packet.
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
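
// The four gebp_traits variants above cover every scalar pairing of a product;
// ResScalar follows ScalarBinaryOpTraits (illustrative instantiations):
//   gebp_traits<float, float>                             -> ResScalar == float
//   gebp_traits<std::complex<float>, float>               -> ResScalar == std::complex<float>
//   gebp_traits<std::complex<float>, std::complex<float>> -> ResScalar == std::complex<float>
//   gebp_traits<float, std::complex<float>>               -> ResScalar == std::complex<float>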
/* optimized General packed Block * packed Panel product kernel */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  Traits traits;
  SwappedTraits straits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
  enum { pk = 8 }; // NOTE Such a large peeling factor is handy for large k
  const Index peeled_kc  = depth & ~(pk-1);
  const Index prefetch_res_offset = 32/sizeof(ResScalar);
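
  // Worked example of the peeling bounds (illustrative numbers, assuming
  // LhsProgress==4 and mr==3*LhsProgress==12): for rows==23,
  //   peeled_mc3 = (23/12)*12       = 12 -> rows [0,12)  use the 3pX4/3pX1 kernels
  //   peeled_mc2 = 12+((23-12)/8)*8 = 20 -> rows [12,20) use the 2pX4/2pX1 kernels
  //   peeled_mc1 = (23/4)*4         = 20 -> no extra single-packet rows here
  // and rows [20,23) fall through to the scalar tail at the end of this function.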
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    // Loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth),
    // and on each largest micro vertical panel of the rhs (depth x nr).
    const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
    // The max(1, ...) is needed because we may be using blocking params larger than
    // what the heuristic computed to fit multiple kernels side by side.
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr
                                    - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // We select a 3*Traits::LhsProgress x nr micro block of res
          // that is entirely stored in registers.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];

          // gets res block as registers
          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          LhsPacket A0, A1;
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            RhsPacket B_0, T0;
            LhsPacket A2;

#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA+(3*K+16)*LhsProgress); \
              if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, T0); \
              traits.madd(A2, B_0, C8, B_0); \
              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C1, T0); \
              traits.madd(A1, B_0, C5, T0); \
              traits.madd(A2, B_0, C9, B_0); \
              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C2,  T0); \
              traits.madd(A1, B_0, C6,  T0); \
              traits.madd(A2, B_0, C10, B_0); \
              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
              traits.madd(A0, B_0, C3,  T0); \
              traits.madd(A1, B_0, C7,  T0); \
              traits.madd(A2, B_0, C11, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while(false)

            internal::prefetch(blB);
            EIGEN_GEBP_ONESTEP(0);
            EIGEN_GEBP_ONESTEP(1);
            EIGEN_GEBP_ONESTEP(2);
            EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4);
            EIGEN_GEBP_ONESTEP(5);
            EIGEN_GEBP_ONESTEP(6);
            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBP_ONESTEP

          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C2,  alphav, R0);
          traits.acc(C6,  alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C3,  alphav, R0);
          traits.acc(C7,  alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];

          // gets res block as registers
          AccPacket C0, C4, C8;
          traits.initAcc(C0);
          traits.initAcc(C4);
          traits.initAcc(C8);

          LinearMapper r0 = res.getLinearMapper(i, j2);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1, A2;
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A2, B_0, C8, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
    // The max(1, ...) is needed because we may be using blocking params larger than
    // what the heuristic computed to fit multiple kernels side by side.
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr
                              - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // We select a 2*Traits::LhsProgress x nr micro block of res
          // that is entirely stored in registers.
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

          // gets res block as registers
          AccPacket C0, C1, C2, C3,
                    C4, C5, C6, C7;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          LhsPacket A0, A1;
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacket B_0, B1, B2, B3, T0;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
              traits.madd(A0, B_0, C0, T0); \
              traits.madd(A1, B_0, C4, B_0); \
              traits.madd(A0, B1, C1, T0); \
              traits.madd(A1, B1, C5, B1); \
              traits.madd(A0, B2, C2, T0); \
              traits.madd(A1, B2, C6, B2); \
              traits.madd(A0, B3, C3, T0); \
              traits.madd(A1, B3, C7, B3); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
            } while(false)

            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            internal::prefetch(blB+(48+16));
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1, B2, B3, T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];

          // gets res block as registers
          AccPacket C0, C4;
          traits.initAcc(C0);
          traits.initAcc(C4);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1;
          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B1); \
              traits.madd(A1, B_0, C4, B_0); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
            } while(false)

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);
          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
    for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
    {
      // loops on each largest micro vertical panel of rhs (depth x nr)
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // We select a 1*Traits::LhsProgress x nr micro block of res
        // that is entirely stored in registers.
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        // gets res block as registers
        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        traits.initAcc(C2);
        traits.initAcc(C3);
        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        LhsPacket A0;
        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
          RhsPacket B_0, B1, B2, B3;

#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \
            traits.madd(A0, B_0, C0, B_0); \
            traits.madd(A0, B1, C1, B1); \
            traits.madd(A0, B2, C2, B2); \
            traits.madd(A0, B3, C3, B3); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
          } while(false)

          internal::prefetch(blB+(48+0));
          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          internal::prefetch(blB+(48+16));
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*4*RhsProgress;
          blA += pk*1*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
        }
        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0, B1, B2, B3;
          EIGEN_GEBGP_ONESTEP(0);
          blB += 4*RhsProgress;
          blA += 1*LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        R1 = r1.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
        r1.storePacket(0 * Traits::ResPacketSize, R1);

        R0 = r2.loadPacket(0 * Traits::ResPacketSize);
        R1 = r3.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0 * Traits::ResPacketSize, R0);
        r3.storePacket(0 * Traits::ResPacketSize, R1);
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        // One column at a time
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];

        // gets res block as registers
        AccPacket C0;
        traits.initAcc(C0);

        LinearMapper r0 = res.getLinearMapper(i, j2);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        LhsPacket A0;
        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
          RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
            traits.madd(A0, B_0, C0, B_0); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
          } while(false)

          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*RhsProgress;
          blA += pk*1*Traits::LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
        }
        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += 1*Traits::LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP

        ResPacket R0;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.loadPacket(0 * Traits::ResPacketSize);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0 * Traits::ResPacketSize, R0);
      }
    }
  }
  //---------- Process remaining rows, 1 at once ----------
  // loop on each panel of the rhs
  for(Index j2=0; j2<packet_cols4; j2+=nr)
  {
    // loop on each remaining row of the lhs
    for(Index i=peeled_mc1; i<rows; i+=1)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

      // The vectorized path below uses the traits with swapped lhs/rhs roles;
      // if LhsProgress is 8, the reduction goes through a half packet.
      typedef typename unpacket_traits<SResPacket>::half SResPacketHalf;
      if ((SwappedTraits::LhsProgress % 4) == 0 &&
          (SwappedTraits::LhsProgress <= 8) &&
          (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr))
      {
        SAccPacket C0, C1, C2, C3;
        straits.initAcc(C0);
        straits.initAcc(C1);
        straits.initAcc(C2);
        straits.initAcc(C3);

        const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
        const Index endk  = (depth/spk)*spk;
        const Index endk4 = (depth/(spk*4))*(spk*4);

        Index k=0;
        for(; k<endk4; k+=4*spk)
        {
          SLhsPacket A0, A1;
          SRhsPacket B_0, B_1;
          straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
          straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);

          straits.loadRhsQuad(blA+0*spk, B_0);
          straits.loadRhsQuad(blA+1*spk, B_1);
          straits.madd(A0,B_0,C0,B_0);
          straits.madd(A1,B_1,C1,B_1);

          straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
          straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
          straits.loadRhsQuad(blA+2*spk, B_0);
          straits.loadRhsQuad(blA+3*spk, B_1);
          straits.madd(A0,B_0,C2,B_0);
          straits.madd(A1,B_1,C3,B_1);

          blB += 4*SwappedTraits::LhsProgress;
          blA += 4*spk;
        }
        C0 = padd(padd(C0,C1),padd(C2,C3));
        for(; k<endk; k+=spk)
        {
          SLhsPacket A0;
          SRhsPacket B_0;

          straits.loadLhsUnaligned(blB, A0);
          straits.loadRhsQuad(blA, B_0);
          straits.madd(A0,B_0,C0,B_0);

          blB += SwappedTraits::LhsProgress;
          blA += spk;
        }
        if(SwappedTraits::LhsProgress==8)
        {
          // Special case where we have to first reduce the accumulation register C0.
          typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
          typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
          typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
          typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;

          SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
          SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

          if(depth-endk>0)
          {
            // We have to handle the last coefficients of the lhs, which correspond to a half-packet
            SLhsPacketHalf a0;
            SRhsPacketHalf b0;
            straits.loadLhsUnaligned(blB, a0);
            straits.loadRhs(blA, b0);
            SAccPacketHalf c0 = predux4(C0);
            straits.madd(a0,b0,c0,b0);
            straits.acc(c0, alphav, R);
          }
          else
          {
            straits.acc(predux4(C0), alphav, R);
          }
          res.scatterPacket(i, j2, R);
        }
        else
        {
          SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
          SResPacket alphav = pset1<SResPacket>(alpha);
          straits.acc(C0, alphav, R);
          res.scatterPacket(i, j2, R);
        }
      }
      else // scalar path
      {
        // get a 1 x 4 res block as registers
        ResScalar C0(0), C1(0), C2(0), C3(0);

        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[0], B_1 = blB[1];
          CJMADD(cj,A0,B_0,C0, B_0);
          CJMADD(cj,A0,B_1,C1, B_1);

          B_0 = blB[2]; B_1 = blB[3];
          CJMADD(cj,A0,B_0,C2, B_0);
          CJMADD(cj,A0,B_1,C3, B_1);

          blB += 4;
        }
        res(i, j2 + 0) += alpha * C0;
        res(i, j2 + 1) += alpha * C1;
        res(i, j2 + 2) += alpha * C2;
        res(i, j2 + 3) += alpha * C3;
      }
    }
  }
  // remaining columns
  for(Index j2=packet_cols4; j2<cols; j2++)
  {
    // loop on each remaining row of the lhs
    for(Index i=peeled_mc1; i<rows; i+=1)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      // gets a 1 x 1 res block as registers
      ResScalar C0(0);
      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      for(Index k=0; k<depth; k++)
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
        CJMADD(cj, A0, B_0, C0, B_0);
      }
      res(i, j2) += alpha * C0;
    }
  }
}
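
// How the pieces fit together (a simplified sketch of what Eigen's
// GeneralMatrixMatrix.h does; the mapper construction and blocking loops are
// omitted, and names like blockA/blockB/mc/kc/nc are local assumptions):
//
//   gemm_pack_lhs<Scalar,Index,LhsMapper,Traits::mr,Traits::LhsProgress,ColMajor> pack_lhs;
//   gemm_pack_rhs<Scalar,Index,RhsMapper,Traits::nr,ColMajor>                     pack_rhs;
//   gebp_kernel<Scalar,Scalar,Index,ResMapper,Traits::mr,Traits::nr,false,false>  gebp;
//
//   pack_lhs(blockA, lhs.getSubMapper(i2,k2), kc, mc);  // mc x kc block of the lhs
//   pack_rhs(blockB, rhs.getSubMapper(k2,j2), kc, nc);  // kc x nc panel of the rhs
//   gebp(res.getSubMapper(i2,j2), blockA, blockB, mc, kc, nc, alpha);  // C += alpha*A*B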
// Pack a block of the lhs into contiguous, kernel-friendly storage.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;

  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;

  Index i=0;
  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        C = lhs.loadPacket(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 2 packets
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.loadPacket(i+0*PacketSize, k);
        B = lhs.loadPacket(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 1 packets
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;

      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.loadPacket(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack scalars
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=Pack2)
    {
      if(PanelMode) count += Pack2 * offset;

      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(i+w, k));

      if(PanelMode) count += Pack2 * (stride-offset-depth);
    }
  }
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
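
// Packed lhs layout example (ColMajor, PacketSize==4, Pack1==12, rows==12,
// depth==2, illustrative): blockA receives A(0:3,0), A(4:7,0), A(8:11,0),
// A(0:3,1), A(4:7,1), A(8:11,1), i.e. mr consecutive rows of each column,
// column after column, so the kernel can stream the panel with aligned
// packet loads.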
// RowMajor variant of gemm_pack_lhs.
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  // Pack rows in groups of `pack` (starting at Pack1, then shrinking),
  // transposing PacketSize x PacketSize tiles on the fly since the lhs is row major.
  Index count = 0;
  int pack = Pack1;
  Index i = 0;
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = i+(remaining_rows/pack)*pack;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      const Index peeled_k = (depth/PacketSize)*PacketSize;
      Index k=0;
      if(pack>=PacketSize)
      {
        for(; k<peeled_k; k+=PacketSize)
        {
          for (Index m = 0; m < pack; m += PacketSize)
          {
            PacketBlock<Packet> kernel;
            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
            ptranspose(kernel);
            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
          }
          count += PacketSize*pack;
        }
      }
      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(pack%4)
          for(; w<pack; ++w)
            blockA[count++] = cj(lhs(i+w, k));
      }

      if(PanelMode) count += pack * (stride-offset-depth);
    }
    pack -= PacketSize;
    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
      pack = Pack2;
  }
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
// Copy a complete panel of the rhs; this version is optimized for column major matrices.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

      Index k=0;
      if((PacketSize%4)==0) // vectorized path: transpose PacketSize x 4 tiles
      {
        for(; k<peeled_k; k+=PacketSize) {
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
          kernel.packet[0           ] = dm0.loadPacket(k);
          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
          ptranspose(kernel);
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
          count+=4*PacketSize;
        }
      }
      for(; k<depth; k++)
      {
        blockB[count+0] = cj(dm0(k));
        blockB[count+1] = cj(dm1(k));
        blockB[count+2] = cj(dm2(k));
        blockB[count+3] = cj(dm3(k));
        count += 4;
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
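
// Packed rhs layout example (nr==4, depth==3, columns j..j+3, illustrative):
// blockB receives B(0,j:j+3), B(1,j:j+3), B(2,j:j+3), i.e. the nr coefficients
// of each row interleaved, which is exactly the order consumed by the kernel's
// loadRhs/broadcastRhs calls.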
// Copy a complete panel of the rhs; this version is optimized for row major matrices.
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  if(nr>=4)
  {
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
          Packet A = rhs.loadPacket(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else {
          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
          blockB[count+0] = cj(dm0(0));
          blockB[count+1] = cj(dm0(1));
          blockB[count+2] = cj(dm0(2));
          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}
/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}

/** Set the cpu L1, L2 and L3 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks for the algorithms working per blocks.
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}
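
// Usage sketch of the public API (illustrative values): override the cache
// sizes used by the blocking heuristic, e.g. when auto-detection fails.
//
//   Eigen::setCpuCacheSizes(32*1024, 512*1024, 4*1024*1024);
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();  // == 32*1024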
#endif // EIGEN_GENERAL_BLOCK_PANEL_H