// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2008-2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H namespace Eigen { namespace internal { template class gebp_traits; /** \internal \returns b if a<=0, and returns a otherwise. */ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a<=0 ? b : a; } /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) { static std::ptrdiff_t m_l1CacheSize = 0; static std::ptrdiff_t m_l2CacheSize = 0; if(m_l2CacheSize==0) { m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); } if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; } else { eigen_internal_assert(false); } } /** \brief Computes the blocking parameters for a m x k times k x n matrix product * * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. * * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, * this function computes the blocking size parameters along the respective dimensions * for matrix products and related algorithms. The blocking sizes depends on various * parameters: * - the L1 and L2 cache sizes, * - the register level blocking sizes defined by gebp_traits, * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ template void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) { EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed // per kc x nr vertical small panels where nr is the blocking size along the n dimension // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. std::ptrdiff_t l1, l2; typedef gebp_traits Traits; enum { kdiv = KcFactor * 2 * Traits::nr * Traits::RhsProgress * sizeof(RhsScalar), mr = gebp_traits::mr, mr_mask = (0xffffffff/mr)*mr }; manage_caching_sizes(GetAction, &l1, &l2); k = std::min(k, l1/kdiv); SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; if(_m inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) { computeProductBlockingSizes(k, m, n); } #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) template struct gebp_madd_selector { EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) { c = cj.pmadd(a,b,c); } }; template struct gebp_madd_selector { EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) { t = b; t = cj.pmul(a,t); c = padd(c,t); } }; template EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) { gebp_madd_selector::run(cj,a,b,c,t); } #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); // #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic * real*real: unpack rhs to constant packets, ... * * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), * storing each res packet into two packets (2x2), * at the end combine them: swap the second and addsub them * cf*cf : same but with 2x4 blocks * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ template class gebp_traits { public: typedef _LhsScalar LhsScalar; typedef _RhsScalar RhsScalar; typedef typename scalar_product_traits::ReturnType ResScalar; enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? packet_traits::size : 1, RhsPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // register block size along the N direction (must be either 2 or 4) nr = NumberOfRegisters/4, // register block size along the M direction (currently, this one cannot be modified) mr = 2 * LhsPacketSize, WorkSpaceFactor = nr * RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = RhsPacketSize }; typedef typename packet_traits::type _LhsPacket; typedef typename packet_traits::type _RhsPacket; typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const { tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); } protected: // conj_helper cj; // conj_helper pcj; }; template class gebp_traits, RealScalar, _ConjLhs, false> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename scalar_product_traits::ReturnType ResScalar; enum { ConjLhs = _ConjLhs, ConjRhs = false, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? packet_traits::size : 1, RhsPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = NumberOfRegisters/4, mr = 2 * LhsPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = RhsPacketSize }; typedef typename packet_traits::type _LhsPacket; typedef typename packet_traits::type _RhsPacket; typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const { c += a * b; } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = cj.pmadd(c,alpha,r); } protected: conj_helper cj; }; template class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { public: typedef std::complex Scalar; typedef std::complex LhsScalar; typedef std::complex RhsScalar; typedef std::complex ResScalar; enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, RealPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, nr = 2, mr = 2 * ResPacketSize, WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, RhsProgress = Vectorizable ? 2*ResPacketSize : 1 }; typedef typename packet_traits::type RealPacket; typedef typename packet_traits::type ScalarPacket; struct DoublePacket { RealPacket first; RealPacket second; }; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) { p.first = pset1(RealScalar(0)); p.second = pset1(RealScalar(0)); } /* Unpack the rhs coeff such that each complex coefficient is spread into * two packects containing respectively the real and imaginary coefficient * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] */ EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) { for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); } else b[k] = rhs[k]; } } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { dest.first = pload((const RealScalar*)b); dest.second = pload((const RealScalar*)(b+ResPacketSize)); } // nothing special here EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload((const typename unpacket_traits::type*)(a)); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const { c = cj.pmadd(a,b,c); } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; if((!ConjLhs)&&(!ConjRhs)) { tmp = pcplxflip(pconj(ResPacket(c.second))); tmp = padd(ResPacket(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { tmp = pconj(pcplxflip(ResPacket(c.second))); tmp = padd(ResPacket(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { tmp = pcplxflip(ResPacket(c.second)); tmp = padd(pconj(ResPacket(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { tmp = pcplxflip(ResPacket(c.second)); tmp = psub(pconj(ResPacket(c.first)),tmp); } r = pmadd(tmp,alpha,r); } protected: conj_helper cj; }; template class gebp_traits, false, _ConjRhs > { public: typedef std::complex Scalar; typedef RealScalar LhsScalar; typedef Scalar RhsScalar; typedef Scalar ResScalar; enum { ConjLhs = false, ConjRhs = _ConjRhs, Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, LhsPacketSize = Vectorizable ? packet_traits::size : 1, RhsPacketSize = Vectorizable ? packet_traits::size : 1, ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, mr = 2*ResPacketSize, WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, RhsProgress = ResPacketSize }; typedef typename packet_traits::type _LhsPacket; typedef typename packet_traits::type _RhsPacket; typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup(a); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const { c += a * b; } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = cj.pmadd(alpha,c,r); } protected: conj_helper cj; }; /* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ template struct gebp_kernel { typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, RhsProgress = Traits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); }; template EIGEN_DONT_INLINE void gebp_kernel ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) { Traits traits; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; // conj_helper pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; // FIXME: const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); const Index peeled_kc = (depth/4)*4; if(unpackedB==0) unpackedB = const_cast(blockB - strideB * nr * RhsProgress); // loops on each micro vertical panel of rhs (depth x nr) for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. for(Index i=0; i(alpha); R0 = ploadu(r0); R1 = ploadu(r1); R2 = ploadu(r2); R3 = ploadu(r3); R4 = ploadu(r0 + ResPacketSize); R5 = ploadu(r1 + ResPacketSize); R6 = ploadu(r2 + ResPacketSize); traits.acc(C0, alphav, R0); pstoreu(r0, R0); R0 = ploadu(r3 + ResPacketSize); traits.acc(C1, alphav, R1); traits.acc(C2, alphav, R2); traits.acc(C3, alphav, R3); traits.acc(C4, alphav, R4); traits.acc(C5, alphav, R5); traits.acc(C6, alphav, R6); traits.acc(C7, alphav, R0); pstoreu(r1, R1); pstoreu(r2, R2); pstoreu(r3, R3); pstoreu(r0 + ResPacketSize, R4); pstoreu(r1 + ResPacketSize, R5); pstoreu(r2 + ResPacketSize, R6); pstoreu(r3 + ResPacketSize, R0); } else { ResPacket R0, R1, R4; ResPacket alphav = pset1(alpha); R0 = ploadu(r0); R1 = ploadu(r1); R4 = ploadu(r0 + ResPacketSize); traits.acc(C0, alphav, R0); pstoreu(r0, R0); R0 = ploadu(r1 + ResPacketSize); traits.acc(C1, alphav, R1); traits.acc(C4, alphav, R4); traits.acc(C5, alphav, R0); pstoreu(r1, R1); pstoreu(r0 + ResPacketSize, R4); pstoreu(r1 + ResPacketSize, R0); } } if(rows-peeled_mc>=LhsProgress) { Index i = peeled_mc; const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; prefetch(&blA[0]); // gets res block as register AccPacket C0, C1, C2, C3; traits.initAcc(C0); traits.initAcc(C1); if(nr==4) traits.initAcc(C2); if(nr==4) traits.initAcc(C3); // performs "inner" product const RhsScalar* blB = unpackedB; for(Index k=0; k(alpha); ResScalar* r0 = &res[(j2+0)*resStride + i]; ResScalar* r1 = r0 + resStride; ResScalar* r2 = r1 + resStride; ResScalar* r3 = r2 + resStride; R0 = ploadu(r0); R1 = ploadu(r1); if(nr==4) R2 = ploadu(r2); if(nr==4) R3 = ploadu(r3); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); if(nr==4) traits.acc(C2, alphav, R2); if(nr==4) traits.acc(C3, alphav, R3); pstoreu(r0, R0); pstoreu(r1, R1); if(nr==4) pstoreu(r2, R2); if(nr==4) pstoreu(r3, R3); } for(Index i=peeled_mc2; i do the same but with nr==1 for(Index j2=packet_cols; j2(alpha); ResScalar* r0 = &res[(j2+0)*resStride + i]; R0 = ploadu(r0); R4 = ploadu(r0+ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R4); pstoreu(r0, R0); pstoreu(r0+ResPacketSize, R4); } if(rows-peeled_mc>=LhsProgress) { Index i = peeled_mc; const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; prefetch(&blA[0]); AccPacket C0; traits.initAcc(C0); const RhsScalar* blB = unpackedB; for(Index k=0; k(alpha); ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); traits.acc(C0, alphav, R0); pstoreu(&res[(j2+0)*resStride + i], R0); } for(Index i=peeled_mc2; i struct gemm_pack_lhs { EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); }; template EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride) EIGEN_UNUSED_VARIABLE(offset) eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; Index peeled_mc = (rows/Pack1)*Pack1; for(Index i=0; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } } } else { for(Index k=0; k=Pack2) { if(PanelMode) count += Pack2*offset; for(Index k=0; k struct gemm_pack_rhs { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); }; template EIGEN_DONT_INLINE void gemm_pack_rhs ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride) EIGEN_UNUSED_VARIABLE(offset) eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2 struct gemm_pack_rhs { enum { PacketSize = packet_traits::size }; EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); }; template EIGEN_DONT_INLINE void gemm_pack_rhs ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); EIGEN_UNUSED_VARIABLE(stride) EIGEN_UNUSED_VARIABLE(offset) eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2