// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number of
 * loads/stores of the result by a factor of 4 and reduces the instruction dependency.
 * Moreover, we know that all bands have the same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * Accesses to the matrix coefficients follow the following logic:
 *
 * - if all columns have the same alignment then
 *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
 *   - otherwise perform unaligned loads only (-> NoneAligned case)
 * - otherwise
 *   - if even columns have the same alignment then
 *     // odd columns are guaranteed to have the same alignment too
 *     - if even or odd columns have the same alignment as the result, then
 *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
 *       - perform half aligned and half unaligned loads (-> EvenAligned case)
 *     - otherwise perform unaligned loads only (-> NoneAligned case)
 *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
 *     - one out of 4 consecutive columns is guaranteed to be aligned with the result vector;
 *       perform simple aligned loads for this column and aligned loads plus re-alignment for the others (-> FirstAligned case)
 *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
 *   - otherwise,
 *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
 *     // we currently fall back to the NoneAligned case
 *
 * The same reasoning applies to the transposed case.
 *
 * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
 * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
 * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8-byte boundary are not too slow
 * compared to unaligned loads on a 4-byte boundary.
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
          ResScalar* res, Index resIncr,
    RhsScalar alpha);
};
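// Illustration (not part of the kernel itself): this specialization is what Eigen's product
// evaluators typically end up calling for a column-major GEMV, i.e. for expressions of the form
// y += alpha * A * x. A minimal sketch of such a call site through the public API, assuming a
// dynamically sized float matrix, would be:
//
//   #include <Eigen/Dense>
//   Eigen::MatrixXf A(1000, 1000);   // column-major by default
//   Eigen::VectorXf x(1000), y(1000);
//   float alpha = 2.f;
//   y.noalias() += alpha * A * x;    // dispatched to a general_matrix_vector_product kernel
//
// The LhsMapper/RhsMapper template parameters abstract the actual storage (data pointer and stride)
// of A and x, so the same kernel serves plain matrices as well as blocks and strided sub-expressions.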
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
        ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
    pstore(&res[j], \
           padd(pload<ResPacket>(&res[j]), \
                padd( \
                  padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),  ptmp0), \
                       pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
                  padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),  ptmp2), \
                       pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))

  typedef typename LhsMapper::VectorMapper LhsScalars;

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_default_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    // Currently, it seems to be better to perform unaligned loads anyway
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                      || (skipColumns + columnsAtOnce >= cols)
                      || LhsPacketSize > size
                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }
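  // Worked example (illustration only, assuming 16-byte SSE packets of 4 floats, so LhsPacketSize==4
  // and LhsPacketAlignedMask==3). alignmentStep is derived from the lhs stride and determines which
  // alignment pattern is used:
  //   lhsStride==8 : alignmentStep = (4 - 8%4) & 3 == 0 -> AllAligned   (all columns share one alignment)
  //   lhsStride==6 : alignmentStep = (4 - 6%4) & 3 == 2 -> EvenAligned  (every second column shares the same alignment)
  //   lhsStride==7 : alignmentStep = (4 - 7%4) & 3 == 1 -> FirstAligned (only one column out of 4 can match the result's alignment)
  // The skipColumns search above then finds the first column whose alignment offset matches alignedStart.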
  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));

    // this helps a lot in generating better binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
                     lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                A00 = lhs0.template load<LhsPacket, Aligned>(j);
                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
        // process aligned result's coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
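// Note on the re-alignment trick used by the FirstAligned paths above and below (illustration only,
// assuming packets of 4 floats): palign<Offset>(A, B) overwrites A with the packet starting at
// position Offset inside the concatenation [A, B]. With A = [a0 a1 a2 a3] and B = [b0 b1 b2 b3],
//   palign<1>(A, B)  ->  A = [a1 a2 a3 b0]
// so two aligned loads plus one palign emulate a single unaligned load, which is what the peeled
// loops rely on to keep all memory accesses aligned.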
/* Optimized row-major matrix * vector product:
 * Each coefficient of the result is the dot product of one row of the lhs with the rhs vector,
 * and 4 rows are processed at once to reduce load/stores and instruction dependencies.
 * The alignment logic mirrors the col-major case above.
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
          ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& lhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  eigen_internal_assert(rhs.stride()==1);

  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  typedef typename LhsMapper::VectorMapper LhsScalars;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index depth = cols;
  const Index lhsStride = lhs.stride();

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = rhs.firstAligned(depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
  const Index rhsAlignmentOffset = rhs.firstAligned(rows);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
  {
    alignedSize = 0;
    alignedStart = 0;
    alignmentPattern = NoneAligned;
  }
  else if(LhsPacketSize > 4)
  {
    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
    alignmentPattern = NoneAligned;
  }
  else if (LhsPacketSize>1)
  {
  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }

    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
                      || LhsPacketSize==1
                      || (skipRows + rowsAtOnce >= rows)
                      || LhsPacketSize > depth
                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }
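  // Note (illustration only): unlike the col-major kernel above, which scatters packet products into
  // res, this kernel accumulates one dot product per row in the packet accumulators ptmp0..ptmp3 used
  // below, and finally reduces each of them with predux(), the horizontal sum of a packet. For a
  // packet of 4 floats,
  //   predux(p) == p[0] + p[1] + p[2] + p[3]
  // and each result coefficient is updated as res[i*resIncr] += alpha * (scalar tail sums + predux(ptmp)).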
  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate better binary code
    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
                     lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs(j, 0);
        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);

                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler !
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs(j, 0);
      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if (lhs0.template aligned<LhsPacket>(alignedStart))
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler !
      for (Index j=alignedSize; j