GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr, RhsScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                         || (skipColumns + columnsAtOnce >= cols)
                         || LhsPacketSize > size
                         || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                         || LhsPacketSize==1
                         || (skipRows + rowsAtOnce >= rows)
                         || LhsPacketSize > depth
                         || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

#undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H