GeneralMatrixVector.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr, RhsScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
//  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                         || (skipColumns + columnsAtOnce >= cols)
                         || LhsPacketSize > size
                         || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
//  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                         || LhsPacketSize==1
                         || (skipRows + rowsAtOnce >= rows)
                         || LhsPacketSize > depth
                         || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are handled
               * using special shift/move operations between the two aligned packets
               * overlapping the desired unaligned packet. This is *much* more efficient
               * than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

#undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H