Eigne Matrix Class Library
Dependents: Eigen_test Odometry_test AttitudeEstimation_usingTicker MPU9250_Quaternion_Binary_Serial ... more
GeneralBlockPanelKernel.h
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H 00011 #define EIGEN_GENERAL_BLOCK_PANEL_H 00012 00013 namespace Eigen { 00014 00015 namespace internal { 00016 00017 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> 00018 class gebp_traits; 00019 00020 00021 /** \internal \returns b if a<=0, and returns a otherwise. */ 00022 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) 00023 { 00024 return a<=0 ? b : a; 00025 } 00026 00027 /** \internal */ 00028 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) 00029 { 00030 static std::ptrdiff_t m_l1CacheSize = 0; 00031 static std::ptrdiff_t m_l2CacheSize = 0; 00032 if(m_l2CacheSize==0) 00033 { 00034 m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); 00035 m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); 00036 } 00037 00038 if(action==SetAction) 00039 { 00040 // set the cpu cache size and cache all block sizes from a global cache size in byte 00041 eigen_internal_assert(l1!=0 && l2!=0); 00042 m_l1CacheSize = *l1; 00043 m_l2CacheSize = *l2; 00044 } 00045 else if(action==GetAction) 00046 { 00047 eigen_internal_assert(l1!=0 && l2!=0); 00048 *l1 = m_l1CacheSize; 00049 *l2 = m_l2CacheSize; 00050 } 00051 else 00052 { 00053 eigen_internal_assert(false); 00054 } 00055 } 00056 00057 /** \brief Computes the blocking parameters for a m x k times k x n matrix product 00058 * 00059 * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. 00060 * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. 00061 * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. 00062 * 00063 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, 00064 * this function computes the blocking size parameters along the respective dimensions 00065 * for matrix products and related algorithms. The blocking sizes depends on various 00066 * parameters: 00067 * - the L1 and L2 cache sizes, 00068 * - the register level blocking sizes defined by gebp_traits, 00069 * - the number of scalars that fit into a packet (when vectorization is enabled). 00070 * 00071 * \sa setCpuCacheSizes */ 00072 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType> 00073 void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) 00074 { 00075 EIGEN_UNUSED_VARIABLE(n); 00076 // Explanations: 00077 // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and 00078 // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed 00079 // per kc x nr vertical small panels where nr is the blocking size along the n dimension 00080 // at the register level. For vectorization purpose, these small vertical panels are unpacked, 00081 // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to 00082 // stay in L1 cache. 00083 std::ptrdiff_t l1, l2; 00084 00085 typedef gebp_traits<LhsScalar,RhsScalar> Traits; 00086 enum { 00087 kdiv = KcFactor * 2 * Traits::nr 00088 * Traits::RhsProgress * sizeof(RhsScalar), 00089 mr = gebp_traits<LhsScalar,RhsScalar>::mr, 00090 mr_mask = (0xffffffff/mr)*mr 00091 }; 00092 00093 manage_caching_sizes(GetAction, &l1, &l2); 00094 k = std::min<SizeType>(k, l1/kdiv); 00095 SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; 00096 if(_m<m) m = _m & mr_mask; 00097 } 00098 00099 template<typename LhsScalar, typename RhsScalar, typename SizeType> 00100 inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) 00101 { 00102 computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); 00103 } 00104 00105 #ifdef EIGEN_HAS_FUSE_CJMADD 00106 #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); 00107 #else 00108 00109 // FIXME (a bit overkill maybe ?) 00110 00111 template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector { 00112 EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) 00113 { 00114 c = cj.pmadd(a,b,c); 00115 } 00116 }; 00117 00118 template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> { 00119 EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) 00120 { 00121 t = b; t = cj.pmul(a,t); c = padd(c,t); 00122 } 00123 }; 00124 00125 template<typename CJ, typename A, typename B, typename C, typename T> 00126 EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) 00127 { 00128 gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); 00129 } 00130 00131 #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); 00132 // #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); 00133 #endif 00134 00135 /* Vectorization logic 00136 * real*real: unpack rhs to constant packets, ... 00137 * 00138 * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), 00139 * storing each res packet into two packets (2x2), 00140 * at the end combine them: swap the second and addsub them 00141 * cf*cf : same but with 2x4 blocks 00142 * cplx*real : unpack rhs to constant packets, ... 00143 * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual 00144 */ 00145 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> 00146 class gebp_traits 00147 { 00148 public: 00149 typedef _LhsScalar LhsScalar; 00150 typedef _RhsScalar RhsScalar; 00151 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; 00152 00153 enum { 00154 ConjLhs = _ConjLhs, 00155 ConjRhs = _ConjRhs, 00156 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, 00157 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, 00158 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, 00159 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, 00160 00161 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, 00162 00163 // register block size along the N direction (must be either 2 or 4) 00164 nr = NumberOfRegisters/4, 00165 00166 // register block size along the M direction (currently, this one cannot be modified) 00167 mr = 2 * LhsPacketSize, 00168 00169 WorkSpaceFactor = nr * RhsPacketSize, 00170 00171 LhsProgress = LhsPacketSize, 00172 RhsProgress = RhsPacketSize 00173 }; 00174 00175 typedef typename packet_traits<LhsScalar>::type _LhsPacket; 00176 typedef typename packet_traits<RhsScalar>::type _RhsPacket; 00177 typedef typename packet_traits<ResScalar>::type _ResPacket; 00178 00179 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; 00180 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; 00181 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; 00182 00183 typedef ResPacket AccPacket; 00184 00185 EIGEN_STRONG_INLINE void initAcc(AccPacket& p) 00186 { 00187 p = pset1<ResPacket>(ResScalar(0)); 00188 } 00189 00190 EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) 00191 { 00192 for(DenseIndex k=0; k<n; k++) 00193 pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); 00194 } 00195 00196 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const 00197 { 00198 dest = pload<RhsPacket>(b); 00199 } 00200 00201 EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const 00202 { 00203 dest = pload<LhsPacket>(a); 00204 } 00205 00206 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const 00207 { 00208 tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); 00209 } 00210 00211 EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const 00212 { 00213 r = pmadd(c,alpha,r); 00214 } 00215 00216 protected: 00217 // conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; 00218 // conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj; 00219 }; 00220 00221 template<typename RealScalar, bool _ConjLhs> 00222 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> 00223 { 00224 public: 00225 typedef std::complex<RealScalar> LhsScalar; 00226 typedef RealScalar RhsScalar; 00227 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; 00228 00229 enum { 00230 ConjLhs = _ConjLhs, 00231 ConjRhs = false, 00232 Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, 00233 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, 00234 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, 00235 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, 00236 00237 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, 00238 nr = NumberOfRegisters/4, 00239 mr = 2 * LhsPacketSize, 00240 WorkSpaceFactor = nr*RhsPacketSize, 00241 00242 LhsProgress = LhsPacketSize, 00243 RhsProgress = RhsPacketSize 00244 }; 00245 00246 typedef typename packet_traits<LhsScalar>::type _LhsPacket; 00247 typedef typename packet_traits<RhsScalar>::type _RhsPacket; 00248 typedef typename packet_traits<ResScalar>::type _ResPacket; 00249 00250 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; 00251 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; 00252 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; 00253 00254 typedef ResPacket AccPacket; 00255 00256 EIGEN_STRONG_INLINE void initAcc(AccPacket& p) 00257 { 00258 p = pset1<ResPacket>(ResScalar(0)); 00259 } 00260 00261 EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) 00262 { 00263 for(DenseIndex k=0; k<n; k++) 00264 pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); 00265 } 00266 00267 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const 00268 { 00269 dest = pload<RhsPacket>(b); 00270 } 00271 00272 EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const 00273 { 00274 dest = pload<LhsPacket>(a); 00275 } 00276 00277 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const 00278 { 00279 madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); 00280 } 00281 00282 EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const 00283 { 00284 tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); 00285 } 00286 00287 EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const 00288 { 00289 c += a * b; 00290 } 00291 00292 EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const 00293 { 00294 r = cj.pmadd(c,alpha,r); 00295 } 00296 00297 protected: 00298 conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; 00299 }; 00300 00301 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> 00302 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > 00303 { 00304 public: 00305 typedef std::complex<RealScalar> Scalar; 00306 typedef std::complex<RealScalar> LhsScalar; 00307 typedef std::complex<RealScalar> RhsScalar; 00308 typedef std::complex<RealScalar> ResScalar; 00309 00310 enum { 00311 ConjLhs = _ConjLhs, 00312 ConjRhs = _ConjRhs, 00313 Vectorizable = packet_traits<RealScalar>::Vectorizable 00314 && packet_traits<Scalar>::Vectorizable, 00315 RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, 00316 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, 00317 00318 nr = 2, 00319 mr = 2 * ResPacketSize, 00320 WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, 00321 00322 LhsProgress = ResPacketSize, 00323 RhsProgress = Vectorizable ? 2*ResPacketSize : 1 00324 }; 00325 00326 typedef typename packet_traits<RealScalar>::type RealPacket; 00327 typedef typename packet_traits<Scalar>::type ScalarPacket; 00328 struct DoublePacket 00329 { 00330 RealPacket first; 00331 RealPacket second; 00332 }; 00333 00334 typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; 00335 typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket; 00336 typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; 00337 typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket; 00338 00339 EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } 00340 00341 EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) 00342 { 00343 p.first = pset1<RealPacket>(RealScalar(0)); 00344 p.second = pset1<RealPacket>(RealScalar(0)); 00345 } 00346 00347 /* Unpack the rhs coeff such that each complex coefficient is spread into 00348 * two packects containing respectively the real and imaginary coefficient 00349 * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] 00350 */ 00351 EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) 00352 { 00353 for(DenseIndex k=0; k<n; k++) 00354 { 00355 if(Vectorizable) 00356 { 00357 pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); 00358 pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); 00359 } 00360 else 00361 b[k] = rhs[k]; 00362 } 00363 } 00364 00365 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } 00366 00367 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const 00368 { 00369 dest.first = pload<RealPacket>((const RealScalar*)b); 00370 dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); 00371 } 00372 00373 // nothing special here 00374 EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const 00375 { 00376 dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); 00377 } 00378 00379 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const 00380 { 00381 c.first = padd(pmul(a,b.first), c.first); 00382 c.second = padd(pmul(a,b.second),c.second); 00383 } 00384 00385 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const 00386 { 00387 c = cj.pmadd(a,b,c); 00388 } 00389 00390 EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } 00391 00392 EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const 00393 { 00394 // assemble c 00395 ResPacket tmp; 00396 if((!ConjLhs)&&(!ConjRhs)) 00397 { 00398 tmp = pcplxflip(pconj(ResPacket(c.second))); 00399 tmp = padd(ResPacket(c.first),tmp); 00400 } 00401 else if((!ConjLhs)&&(ConjRhs)) 00402 { 00403 tmp = pconj(pcplxflip(ResPacket(c.second))); 00404 tmp = padd(ResPacket(c.first),tmp); 00405 } 00406 else if((ConjLhs)&&(!ConjRhs)) 00407 { 00408 tmp = pcplxflip(ResPacket(c.second)); 00409 tmp = padd(pconj(ResPacket(c.first)),tmp); 00410 } 00411 else if((ConjLhs)&&(ConjRhs)) 00412 { 00413 tmp = pcplxflip(ResPacket(c.second)); 00414 tmp = psub(pconj(ResPacket(c.first)),tmp); 00415 } 00416 00417 r = pmadd(tmp,alpha,r); 00418 } 00419 00420 protected: 00421 conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; 00422 }; 00423 00424 template<typename RealScalar, bool _ConjRhs> 00425 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > 00426 { 00427 public: 00428 typedef std::complex<RealScalar> Scalar; 00429 typedef RealScalar LhsScalar; 00430 typedef Scalar RhsScalar; 00431 typedef Scalar ResScalar; 00432 00433 enum { 00434 ConjLhs = false, 00435 ConjRhs = _ConjRhs, 00436 Vectorizable = packet_traits<RealScalar>::Vectorizable 00437 && packet_traits<Scalar>::Vectorizable, 00438 LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, 00439 RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, 00440 ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, 00441 00442 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, 00443 nr = 4, 00444 mr = 2*ResPacketSize, 00445 WorkSpaceFactor = nr*RhsPacketSize, 00446 00447 LhsProgress = ResPacketSize, 00448 RhsProgress = ResPacketSize 00449 }; 00450 00451 typedef typename packet_traits<LhsScalar>::type _LhsPacket; 00452 typedef typename packet_traits<RhsScalar>::type _RhsPacket; 00453 typedef typename packet_traits<ResScalar>::type _ResPacket; 00454 00455 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; 00456 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; 00457 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; 00458 00459 typedef ResPacket AccPacket; 00460 00461 EIGEN_STRONG_INLINE void initAcc(AccPacket& p) 00462 { 00463 p = pset1<ResPacket>(ResScalar(0)); 00464 } 00465 00466 EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) 00467 { 00468 for(DenseIndex k=0; k<n; k++) 00469 pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); 00470 } 00471 00472 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const 00473 { 00474 dest = pload<RhsPacket>(b); 00475 } 00476 00477 EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const 00478 { 00479 dest = ploaddup<LhsPacket>(a); 00480 } 00481 00482 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const 00483 { 00484 madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); 00485 } 00486 00487 EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const 00488 { 00489 tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); 00490 } 00491 00492 EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const 00493 { 00494 c += a * b; 00495 } 00496 00497 EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const 00498 { 00499 r = cj.pmadd(alpha,c,r); 00500 } 00501 00502 protected: 00503 conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; 00504 }; 00505 00506 /* optimized GEneral packed Block * packed Panel product kernel 00507 * 00508 * Mixing type logic: C += A * B 00509 * | A | B | comments 00510 * |real |cplx | no vectorization yet, would require to pack A with duplication 00511 * |cplx |real | easy vectorization 00512 */ 00513 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> 00514 struct gebp_kernel 00515 { 00516 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; 00517 typedef typename Traits::ResScalar ResScalar; 00518 typedef typename Traits::LhsPacket LhsPacket; 00519 typedef typename Traits::RhsPacket RhsPacket; 00520 typedef typename Traits::ResPacket ResPacket; 00521 typedef typename Traits::AccPacket AccPacket; 00522 00523 enum { 00524 Vectorizable = Traits::Vectorizable, 00525 LhsProgress = Traits::LhsProgress, 00526 RhsProgress = Traits::RhsProgress, 00527 ResPacketSize = Traits::ResPacketSize 00528 }; 00529 00530 EIGEN_DONT_INLINE 00531 void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, 00532 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); 00533 }; 00534 00535 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> 00536 EIGEN_DONT_INLINE 00537 void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> 00538 ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, 00539 Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) 00540 { 00541 Traits traits; 00542 00543 if(strideA==-1) strideA = depth; 00544 if(strideB==-1) strideB = depth; 00545 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; 00546 // conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; 00547 Index packet_cols = (cols/nr) * nr; 00548 const Index peeled_mc = (rows/mr)*mr; 00549 // FIXME: 00550 const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); 00551 const Index peeled_kc = (depth/4)*4; 00552 00553 if(unpackedB==0) 00554 unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress); 00555 00556 // loops on each micro vertical panel of rhs (depth x nr) 00557 for(Index j2=0; j2<packet_cols; j2+=nr) 00558 { 00559 traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 00560 00561 // loops on each largest micro horizontal panel of lhs (mr x depth) 00562 // => we select a mr x nr micro block of res which is entirely 00563 // stored into mr/packet_size x nr registers. 00564 for(Index i=0; i<peeled_mc; i+=mr) 00565 { 00566 const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; 00567 prefetch(&blA[0]); 00568 00569 // gets res block as register 00570 AccPacket C0, C1, C2, C3, C4, C5, C6, C7; 00571 traits.initAcc(C0); 00572 traits.initAcc(C1); 00573 if(nr==4) traits.initAcc(C2); 00574 if(nr==4) traits.initAcc(C3); 00575 traits.initAcc(C4); 00576 traits.initAcc(C5); 00577 if(nr==4) traits.initAcc(C6); 00578 if(nr==4) traits.initAcc(C7); 00579 00580 ResScalar* r0 = &res[(j2+0)*resStride + i]; 00581 ResScalar* r1 = r0 + resStride; 00582 ResScalar* r2 = r1 + resStride; 00583 ResScalar* r3 = r2 + resStride; 00584 00585 prefetch(r0+16); 00586 prefetch(r1+16); 00587 prefetch(r2+16); 00588 prefetch(r3+16); 00589 00590 // performs "inner" product 00591 // TODO let's check wether the folowing peeled loop could not be 00592 // optimized via optimal prefetching from one loop to the other 00593 const RhsScalar* blB = unpackedB; 00594 for(Index k=0; k<peeled_kc; k+=4) 00595 { 00596 if(nr==2) 00597 { 00598 LhsPacket A0, A1; 00599 RhsPacket B_0; 00600 RhsPacket T0; 00601 00602 EIGEN_ASM_COMMENT("mybegin2"); 00603 traits.loadLhs(&blA[0*LhsProgress], A0); 00604 traits.loadLhs(&blA[1*LhsProgress], A1); 00605 traits.loadRhs(&blB[0*RhsProgress], B_0); 00606 traits.madd(A0,B_0,C0,T0); 00607 traits.madd(A1,B_0,C4,B_0); 00608 traits.loadRhs(&blB[1*RhsProgress], B_0); 00609 traits.madd(A0,B_0,C1,T0); 00610 traits.madd(A1,B_0,C5,B_0); 00611 00612 traits.loadLhs(&blA[2*LhsProgress], A0); 00613 traits.loadLhs(&blA[3*LhsProgress], A1); 00614 traits.loadRhs(&blB[2*RhsProgress], B_0); 00615 traits.madd(A0,B_0,C0,T0); 00616 traits.madd(A1,B_0,C4,B_0); 00617 traits.loadRhs(&blB[3*RhsProgress], B_0); 00618 traits.madd(A0,B_0,C1,T0); 00619 traits.madd(A1,B_0,C5,B_0); 00620 00621 traits.loadLhs(&blA[4*LhsProgress], A0); 00622 traits.loadLhs(&blA[5*LhsProgress], A1); 00623 traits.loadRhs(&blB[4*RhsProgress], B_0); 00624 traits.madd(A0,B_0,C0,T0); 00625 traits.madd(A1,B_0,C4,B_0); 00626 traits.loadRhs(&blB[5*RhsProgress], B_0); 00627 traits.madd(A0,B_0,C1,T0); 00628 traits.madd(A1,B_0,C5,B_0); 00629 00630 traits.loadLhs(&blA[6*LhsProgress], A0); 00631 traits.loadLhs(&blA[7*LhsProgress], A1); 00632 traits.loadRhs(&blB[6*RhsProgress], B_0); 00633 traits.madd(A0,B_0,C0,T0); 00634 traits.madd(A1,B_0,C4,B_0); 00635 traits.loadRhs(&blB[7*RhsProgress], B_0); 00636 traits.madd(A0,B_0,C1,T0); 00637 traits.madd(A1,B_0,C5,B_0); 00638 EIGEN_ASM_COMMENT("myend"); 00639 } 00640 else 00641 { 00642 EIGEN_ASM_COMMENT("mybegin4"); 00643 LhsPacket A0, A1; 00644 RhsPacket B_0, B1, B2, B3; 00645 RhsPacket T0; 00646 00647 traits.loadLhs(&blA[0*LhsProgress], A0); 00648 traits.loadLhs(&blA[1*LhsProgress], A1); 00649 traits.loadRhs(&blB[0*RhsProgress], B_0); 00650 traits.loadRhs(&blB[1*RhsProgress], B1); 00651 00652 traits.madd(A0,B_0,C0,T0); 00653 traits.loadRhs(&blB[2*RhsProgress], B2); 00654 traits.madd(A1,B_0,C4,B_0); 00655 traits.loadRhs(&blB[3*RhsProgress], B3); 00656 traits.loadRhs(&blB[4*RhsProgress], B_0); 00657 traits.madd(A0,B1,C1,T0); 00658 traits.madd(A1,B1,C5,B1); 00659 traits.loadRhs(&blB[5*RhsProgress], B1); 00660 traits.madd(A0,B2,C2,T0); 00661 traits.madd(A1,B2,C6,B2); 00662 traits.loadRhs(&blB[6*RhsProgress], B2); 00663 traits.madd(A0,B3,C3,T0); 00664 traits.loadLhs(&blA[2*LhsProgress], A0); 00665 traits.madd(A1,B3,C7,B3); 00666 traits.loadLhs(&blA[3*LhsProgress], A1); 00667 traits.loadRhs(&blB[7*RhsProgress], B3); 00668 traits.madd(A0,B_0,C0,T0); 00669 traits.madd(A1,B_0,C4,B_0); 00670 traits.loadRhs(&blB[8*RhsProgress], B_0); 00671 traits.madd(A0,B1,C1,T0); 00672 traits.madd(A1,B1,C5,B1); 00673 traits.loadRhs(&blB[9*RhsProgress], B1); 00674 traits.madd(A0,B2,C2,T0); 00675 traits.madd(A1,B2,C6,B2); 00676 traits.loadRhs(&blB[10*RhsProgress], B2); 00677 traits.madd(A0,B3,C3,T0); 00678 traits.loadLhs(&blA[4*LhsProgress], A0); 00679 traits.madd(A1,B3,C7,B3); 00680 traits.loadLhs(&blA[5*LhsProgress], A1); 00681 traits.loadRhs(&blB[11*RhsProgress], B3); 00682 00683 traits.madd(A0,B_0,C0,T0); 00684 traits.madd(A1,B_0,C4,B_0); 00685 traits.loadRhs(&blB[12*RhsProgress], B_0); 00686 traits.madd(A0,B1,C1,T0); 00687 traits.madd(A1,B1,C5,B1); 00688 traits.loadRhs(&blB[13*RhsProgress], B1); 00689 traits.madd(A0,B2,C2,T0); 00690 traits.madd(A1,B2,C6,B2); 00691 traits.loadRhs(&blB[14*RhsProgress], B2); 00692 traits.madd(A0,B3,C3,T0); 00693 traits.loadLhs(&blA[6*LhsProgress], A0); 00694 traits.madd(A1,B3,C7,B3); 00695 traits.loadLhs(&blA[7*LhsProgress], A1); 00696 traits.loadRhs(&blB[15*RhsProgress], B3); 00697 traits.madd(A0,B_0,C0,T0); 00698 traits.madd(A1,B_0,C4,B_0); 00699 traits.madd(A0,B1,C1,T0); 00700 traits.madd(A1,B1,C5,B1); 00701 traits.madd(A0,B2,C2,T0); 00702 traits.madd(A1,B2,C6,B2); 00703 traits.madd(A0,B3,C3,T0); 00704 traits.madd(A1,B3,C7,B3); 00705 } 00706 00707 blB += 4*nr*RhsProgress; 00708 blA += 4*mr; 00709 } 00710 // process remaining peeled loop 00711 for(Index k=peeled_kc; k<depth; k++) 00712 { 00713 if(nr==2) 00714 { 00715 LhsPacket A0, A1; 00716 RhsPacket B_0; 00717 RhsPacket T0; 00718 00719 traits.loadLhs(&blA[0*LhsProgress], A0); 00720 traits.loadLhs(&blA[1*LhsProgress], A1); 00721 traits.loadRhs(&blB[0*RhsProgress], B_0); 00722 traits.madd(A0,B_0,C0,T0); 00723 traits.madd(A1,B_0,C4,B_0); 00724 traits.loadRhs(&blB[1*RhsProgress], B_0); 00725 traits.madd(A0,B_0,C1,T0); 00726 traits.madd(A1,B_0,C5,B_0); 00727 } 00728 else 00729 { 00730 LhsPacket A0, A1; 00731 RhsPacket B_0, B1, B2, B3; 00732 RhsPacket T0; 00733 00734 traits.loadLhs(&blA[0*LhsProgress], A0); 00735 traits.loadLhs(&blA[1*LhsProgress], A1); 00736 traits.loadRhs(&blB[0*RhsProgress], B_0); 00737 traits.loadRhs(&blB[1*RhsProgress], B1); 00738 00739 traits.madd(A0,B_0,C0,T0); 00740 traits.loadRhs(&blB[2*RhsProgress], B2); 00741 traits.madd(A1,B_0,C4,B_0); 00742 traits.loadRhs(&blB[3*RhsProgress], B3); 00743 traits.madd(A0,B1,C1,T0); 00744 traits.madd(A1,B1,C5,B1); 00745 traits.madd(A0,B2,C2,T0); 00746 traits.madd(A1,B2,C6,B2); 00747 traits.madd(A0,B3,C3,T0); 00748 traits.madd(A1,B3,C7,B3); 00749 } 00750 00751 blB += nr*RhsProgress; 00752 blA += mr; 00753 } 00754 00755 if(nr==4) 00756 { 00757 ResPacket R0, R1, R2, R3, R4, R5, R6; 00758 ResPacket alphav = pset1<ResPacket>(alpha); 00759 00760 R0 = ploadu<ResPacket>(r0); 00761 R1 = ploadu<ResPacket>(r1); 00762 R2 = ploadu<ResPacket>(r2); 00763 R3 = ploadu<ResPacket>(r3); 00764 R4 = ploadu<ResPacket>(r0 + ResPacketSize); 00765 R5 = ploadu<ResPacket>(r1 + ResPacketSize); 00766 R6 = ploadu<ResPacket>(r2 + ResPacketSize); 00767 traits.acc(C0, alphav, R0); 00768 pstoreu(r0, R0); 00769 R0 = ploadu<ResPacket>(r3 + ResPacketSize); 00770 00771 traits.acc(C1, alphav, R1); 00772 traits.acc(C2, alphav, R2); 00773 traits.acc(C3, alphav, R3); 00774 traits.acc(C4, alphav, R4); 00775 traits.acc(C5, alphav, R5); 00776 traits.acc(C6, alphav, R6); 00777 traits.acc(C7, alphav, R0); 00778 00779 pstoreu(r1, R1); 00780 pstoreu(r2, R2); 00781 pstoreu(r3, R3); 00782 pstoreu(r0 + ResPacketSize, R4); 00783 pstoreu(r1 + ResPacketSize, R5); 00784 pstoreu(r2 + ResPacketSize, R6); 00785 pstoreu(r3 + ResPacketSize, R0); 00786 } 00787 else 00788 { 00789 ResPacket R0, R1, R4; 00790 ResPacket alphav = pset1<ResPacket>(alpha); 00791 00792 R0 = ploadu<ResPacket>(r0); 00793 R1 = ploadu<ResPacket>(r1); 00794 R4 = ploadu<ResPacket>(r0 + ResPacketSize); 00795 traits.acc(C0, alphav, R0); 00796 pstoreu(r0, R0); 00797 R0 = ploadu<ResPacket>(r1 + ResPacketSize); 00798 traits.acc(C1, alphav, R1); 00799 traits.acc(C4, alphav, R4); 00800 traits.acc(C5, alphav, R0); 00801 pstoreu(r1, R1); 00802 pstoreu(r0 + ResPacketSize, R4); 00803 pstoreu(r1 + ResPacketSize, R0); 00804 } 00805 00806 } 00807 00808 if(rows-peeled_mc>=LhsProgress) 00809 { 00810 Index i = peeled_mc; 00811 const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; 00812 prefetch(&blA[0]); 00813 00814 // gets res block as register 00815 AccPacket C0, C1, C2, C3; 00816 traits.initAcc(C0); 00817 traits.initAcc(C1); 00818 if(nr==4) traits.initAcc(C2); 00819 if(nr==4) traits.initAcc(C3); 00820 00821 // performs "inner" product 00822 const RhsScalar* blB = unpackedB; 00823 for(Index k=0; k<peeled_kc; k+=4) 00824 { 00825 if(nr==2) 00826 { 00827 LhsPacket A0; 00828 RhsPacket B_0, B1; 00829 00830 traits.loadLhs(&blA[0*LhsProgress], A0); 00831 traits.loadRhs(&blB[0*RhsProgress], B_0); 00832 traits.loadRhs(&blB[1*RhsProgress], B1); 00833 traits.madd(A0,B_0,C0,B_0); 00834 traits.loadRhs(&blB[2*RhsProgress], B_0); 00835 traits.madd(A0,B1,C1,B1); 00836 traits.loadLhs(&blA[1*LhsProgress], A0); 00837 traits.loadRhs(&blB[3*RhsProgress], B1); 00838 traits.madd(A0,B_0,C0,B_0); 00839 traits.loadRhs(&blB[4*RhsProgress], B_0); 00840 traits.madd(A0,B1,C1,B1); 00841 traits.loadLhs(&blA[2*LhsProgress], A0); 00842 traits.loadRhs(&blB[5*RhsProgress], B1); 00843 traits.madd(A0,B_0,C0,B_0); 00844 traits.loadRhs(&blB[6*RhsProgress], B_0); 00845 traits.madd(A0,B1,C1,B1); 00846 traits.loadLhs(&blA[3*LhsProgress], A0); 00847 traits.loadRhs(&blB[7*RhsProgress], B1); 00848 traits.madd(A0,B_0,C0,B_0); 00849 traits.madd(A0,B1,C1,B1); 00850 } 00851 else 00852 { 00853 LhsPacket A0; 00854 RhsPacket B_0, B1, B2, B3; 00855 00856 traits.loadLhs(&blA[0*LhsProgress], A0); 00857 traits.loadRhs(&blB[0*RhsProgress], B_0); 00858 traits.loadRhs(&blB[1*RhsProgress], B1); 00859 00860 traits.madd(A0,B_0,C0,B_0); 00861 traits.loadRhs(&blB[2*RhsProgress], B2); 00862 traits.loadRhs(&blB[3*RhsProgress], B3); 00863 traits.loadRhs(&blB[4*RhsProgress], B_0); 00864 traits.madd(A0,B1,C1,B1); 00865 traits.loadRhs(&blB[5*RhsProgress], B1); 00866 traits.madd(A0,B2,C2,B2); 00867 traits.loadRhs(&blB[6*RhsProgress], B2); 00868 traits.madd(A0,B3,C3,B3); 00869 traits.loadLhs(&blA[1*LhsProgress], A0); 00870 traits.loadRhs(&blB[7*RhsProgress], B3); 00871 traits.madd(A0,B_0,C0,B_0); 00872 traits.loadRhs(&blB[8*RhsProgress], B_0); 00873 traits.madd(A0,B1,C1,B1); 00874 traits.loadRhs(&blB[9*RhsProgress], B1); 00875 traits.madd(A0,B2,C2,B2); 00876 traits.loadRhs(&blB[10*RhsProgress], B2); 00877 traits.madd(A0,B3,C3,B3); 00878 traits.loadLhs(&blA[2*LhsProgress], A0); 00879 traits.loadRhs(&blB[11*RhsProgress], B3); 00880 00881 traits.madd(A0,B_0,C0,B_0); 00882 traits.loadRhs(&blB[12*RhsProgress], B_0); 00883 traits.madd(A0,B1,C1,B1); 00884 traits.loadRhs(&blB[13*RhsProgress], B1); 00885 traits.madd(A0,B2,C2,B2); 00886 traits.loadRhs(&blB[14*RhsProgress], B2); 00887 traits.madd(A0,B3,C3,B3); 00888 00889 traits.loadLhs(&blA[3*LhsProgress], A0); 00890 traits.loadRhs(&blB[15*RhsProgress], B3); 00891 traits.madd(A0,B_0,C0,B_0); 00892 traits.madd(A0,B1,C1,B1); 00893 traits.madd(A0,B2,C2,B2); 00894 traits.madd(A0,B3,C3,B3); 00895 } 00896 00897 blB += nr*4*RhsProgress; 00898 blA += 4*LhsProgress; 00899 } 00900 // process remaining peeled loop 00901 for(Index k=peeled_kc; k<depth; k++) 00902 { 00903 if(nr==2) 00904 { 00905 LhsPacket A0; 00906 RhsPacket B_0, B1; 00907 00908 traits.loadLhs(&blA[0*LhsProgress], A0); 00909 traits.loadRhs(&blB[0*RhsProgress], B_0); 00910 traits.loadRhs(&blB[1*RhsProgress], B1); 00911 traits.madd(A0,B_0,C0,B_0); 00912 traits.madd(A0,B1,C1,B1); 00913 } 00914 else 00915 { 00916 LhsPacket A0; 00917 RhsPacket B_0, B1, B2, B3; 00918 00919 traits.loadLhs(&blA[0*LhsProgress], A0); 00920 traits.loadRhs(&blB[0*RhsProgress], B_0); 00921 traits.loadRhs(&blB[1*RhsProgress], B1); 00922 traits.loadRhs(&blB[2*RhsProgress], B2); 00923 traits.loadRhs(&blB[3*RhsProgress], B3); 00924 00925 traits.madd(A0,B_0,C0,B_0); 00926 traits.madd(A0,B1,C1,B1); 00927 traits.madd(A0,B2,C2,B2); 00928 traits.madd(A0,B3,C3,B3); 00929 } 00930 00931 blB += nr*RhsProgress; 00932 blA += LhsProgress; 00933 } 00934 00935 ResPacket R0, R1, R2, R3; 00936 ResPacket alphav = pset1<ResPacket>(alpha); 00937 00938 ResScalar* r0 = &res[(j2+0)*resStride + i]; 00939 ResScalar* r1 = r0 + resStride; 00940 ResScalar* r2 = r1 + resStride; 00941 ResScalar* r3 = r2 + resStride; 00942 00943 R0 = ploadu<ResPacket>(r0); 00944 R1 = ploadu<ResPacket>(r1); 00945 if(nr==4) R2 = ploadu<ResPacket>(r2); 00946 if(nr==4) R3 = ploadu<ResPacket>(r3); 00947 00948 traits.acc(C0, alphav, R0); 00949 traits.acc(C1, alphav, R1); 00950 if(nr==4) traits.acc(C2, alphav, R2); 00951 if(nr==4) traits.acc(C3, alphav, R3); 00952 00953 pstoreu(r0, R0); 00954 pstoreu(r1, R1); 00955 if(nr==4) pstoreu(r2, R2); 00956 if(nr==4) pstoreu(r3, R3); 00957 } 00958 for(Index i=peeled_mc2; i<rows; i++) 00959 { 00960 const LhsScalar* blA = &blockA[i*strideA+offsetA]; 00961 prefetch(&blA[0]); 00962 00963 // gets a 1 x nr res block as registers 00964 ResScalar C0(0), C1(0), C2(0), C3(0); 00965 // TODO directly use blockB ??? 00966 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; 00967 for(Index k=0; k<depth; k++) 00968 { 00969 if(nr==2) 00970 { 00971 LhsScalar A0; 00972 RhsScalar B_0, B1; 00973 00974 A0 = blA[k]; 00975 B_0 = blB[0]; 00976 B1 = blB[1]; 00977 MADD(cj,A0,B_0,C0,B_0); 00978 MADD(cj,A0,B1,C1,B1); 00979 } 00980 else 00981 { 00982 LhsScalar A0; 00983 RhsScalar B_0, B1, B2, B3; 00984 00985 A0 = blA[k]; 00986 B_0 = blB[0]; 00987 B1 = blB[1]; 00988 B2 = blB[2]; 00989 B3 = blB[3]; 00990 00991 MADD(cj,A0,B_0,C0,B_0); 00992 MADD(cj,A0,B1,C1,B1); 00993 MADD(cj,A0,B2,C2,B2); 00994 MADD(cj,A0,B3,C3,B3); 00995 } 00996 00997 blB += nr; 00998 } 00999 res[(j2+0)*resStride + i] += alpha*C0; 01000 res[(j2+1)*resStride + i] += alpha*C1; 01001 if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; 01002 if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; 01003 } 01004 } 01005 // process remaining rhs/res columns one at a time 01006 // => do the same but with nr==1 01007 for(Index j2=packet_cols; j2<cols; j2++) 01008 { 01009 // unpack B 01010 traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); 01011 01012 for(Index i=0; i<peeled_mc; i+=mr) 01013 { 01014 const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; 01015 prefetch(&blA[0]); 01016 01017 // TODO move the res loads to the stores 01018 01019 // get res block as registers 01020 AccPacket C0, C4; 01021 traits.initAcc(C0); 01022 traits.initAcc(C4); 01023 01024 const RhsScalar* blB = unpackedB; 01025 for(Index k=0; k<depth; k++) 01026 { 01027 LhsPacket A0, A1; 01028 RhsPacket B_0; 01029 RhsPacket T0; 01030 01031 traits.loadLhs(&blA[0*LhsProgress], A0); 01032 traits.loadLhs(&blA[1*LhsProgress], A1); 01033 traits.loadRhs(&blB[0*RhsProgress], B_0); 01034 traits.madd(A0,B_0,C0,T0); 01035 traits.madd(A1,B_0,C4,B_0); 01036 01037 blB += RhsProgress; 01038 blA += 2*LhsProgress; 01039 } 01040 ResPacket R0, R4; 01041 ResPacket alphav = pset1<ResPacket>(alpha); 01042 01043 ResScalar* r0 = &res[(j2+0)*resStride + i]; 01044 01045 R0 = ploadu<ResPacket>(r0); 01046 R4 = ploadu<ResPacket>(r0+ResPacketSize); 01047 01048 traits.acc(C0, alphav, R0); 01049 traits.acc(C4, alphav, R4); 01050 01051 pstoreu(r0, R0); 01052 pstoreu(r0+ResPacketSize, R4); 01053 } 01054 if(rows-peeled_mc>=LhsProgress) 01055 { 01056 Index i = peeled_mc; 01057 const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; 01058 prefetch(&blA[0]); 01059 01060 AccPacket C0; 01061 traits.initAcc(C0); 01062 01063 const RhsScalar* blB = unpackedB; 01064 for(Index k=0; k<depth; k++) 01065 { 01066 LhsPacket A0; 01067 RhsPacket B_0; 01068 traits.loadLhs(blA, A0); 01069 traits.loadRhs(blB, B_0); 01070 traits.madd(A0, B_0, C0, B_0); 01071 blB += RhsProgress; 01072 blA += LhsProgress; 01073 } 01074 01075 ResPacket alphav = pset1<ResPacket>(alpha); 01076 ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]); 01077 traits.acc(C0, alphav, R0); 01078 pstoreu(&res[(j2+0)*resStride + i], R0); 01079 } 01080 for(Index i=peeled_mc2; i<rows; i++) 01081 { 01082 const LhsScalar* blA = &blockA[i*strideA+offsetA]; 01083 prefetch(&blA[0]); 01084 01085 // gets a 1 x 1 res block as registers 01086 ResScalar C0(0); 01087 // FIXME directly use blockB ?? 01088 const RhsScalar* blB = &blockB[j2*strideB+offsetB]; 01089 for(Index k=0; k<depth; k++) 01090 { 01091 LhsScalar A0 = blA[k]; 01092 RhsScalar B_0 = blB[k]; 01093 MADD(cj, A0, B_0, C0, B_0); 01094 } 01095 res[(j2+0)*resStride + i] += alpha*C0; 01096 } 01097 } 01098 } 01099 01100 01101 #undef CJMADD 01102 01103 // pack a block of the lhs 01104 // The traversal is as follow (mr==4): 01105 // 0 4 8 12 ... 01106 // 1 5 9 13 ... 01107 // 2 6 10 14 ... 01108 // 3 7 11 15 ... 01109 // 01110 // 16 20 24 28 ... 01111 // 17 21 25 29 ... 01112 // 18 22 26 30 ... 01113 // 19 23 27 31 ... 01114 // 01115 // 32 33 34 35 ... 01116 // 36 36 38 39 ... 01117 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> 01118 struct gemm_pack_lhs 01119 { 01120 EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); 01121 }; 01122 01123 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> 01124 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode> 01125 ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) 01126 { 01127 typedef typename packet_traits<Scalar>::type Packet; 01128 enum { PacketSize = packet_traits<Scalar>::size }; 01129 01130 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); 01131 EIGEN_UNUSED_VARIABLE(stride) 01132 EIGEN_UNUSED_VARIABLE(offset) 01133 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); 01134 eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); 01135 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; 01136 const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); 01137 Index count = 0; 01138 Index peeled_mc = (rows/Pack1)*Pack1; 01139 for(Index i=0; i<peeled_mc; i+=Pack1) 01140 { 01141 if(PanelMode) count += Pack1 * offset; 01142 01143 if(StorageOrder==ColMajor) 01144 { 01145 for(Index k=0; k<depth; k++) 01146 { 01147 Packet A, B, C, D; 01148 if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); 01149 if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); 01150 if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); 01151 if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); 01152 if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } 01153 if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } 01154 if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } 01155 if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } 01156 } 01157 } 01158 else 01159 { 01160 for(Index k=0; k<depth; k++) 01161 { 01162 // TODO add a vectorized transpose here 01163 Index w=0; 01164 for(; w<Pack1-3; w+=4) 01165 { 01166 Scalar a(cj(lhs(i+w+0, k))), 01167 b(cj(lhs(i+w+1, k))), 01168 c(cj(lhs(i+w+2, k))), 01169 d(cj(lhs(i+w+3, k))); 01170 blockA[count++] = a; 01171 blockA[count++] = b; 01172 blockA[count++] = c; 01173 blockA[count++] = d; 01174 } 01175 if(Pack1%4) 01176 for(;w<Pack1;++w) 01177 blockA[count++] = cj(lhs(i+w, k)); 01178 } 01179 } 01180 if(PanelMode) count += Pack1 * (stride-offset-depth); 01181 } 01182 if(rows-peeled_mc>=Pack2) 01183 { 01184 if(PanelMode) count += Pack2*offset; 01185 for(Index k=0; k<depth; k++) 01186 for(Index w=0; w<Pack2; w++) 01187 blockA[count++] = cj(lhs(peeled_mc+w, k)); 01188 if(PanelMode) count += Pack2 * (stride-offset-depth); 01189 peeled_mc += Pack2; 01190 } 01191 for(Index i=peeled_mc; i<rows; i++) 01192 { 01193 if(PanelMode) count += offset; 01194 for(Index k=0; k<depth; k++) 01195 blockA[count++] = cj(lhs(i, k)); 01196 if(PanelMode) count += (stride-offset-depth); 01197 } 01198 } 01199 01200 // copy a complete panel of the rhs 01201 // this version is optimized for column major matrices 01202 // The traversal order is as follow: (nr==4): 01203 // 0 1 2 3 12 13 14 15 24 27 01204 // 4 5 6 7 16 17 18 19 25 28 01205 // 8 9 10 11 20 21 22 23 26 29 01206 // . . . . . . . . . . 01207 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> 01208 struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> 01209 { 01210 typedef typename packet_traits<Scalar>::type Packet; 01211 enum { PacketSize = packet_traits<Scalar>::size }; 01212 EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); 01213 }; 01214 01215 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> 01216 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> 01217 ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) 01218 { 01219 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); 01220 EIGEN_UNUSED_VARIABLE(stride) 01221 EIGEN_UNUSED_VARIABLE(offset) 01222 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); 01223 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; 01224 Index packet_cols = (cols/nr) * nr; 01225 Index count = 0; 01226 for(Index j2=0; j2<packet_cols; j2+=nr) 01227 { 01228 // skip what we have before 01229 if(PanelMode) count += nr * offset; 01230 const Scalar* b0 = &rhs[(j2+0)*rhsStride]; 01231 const Scalar* b1 = &rhs[(j2+1)*rhsStride]; 01232 const Scalar* b2 = &rhs[(j2+2)*rhsStride]; 01233 const Scalar* b3 = &rhs[(j2+3)*rhsStride]; 01234 for(Index k=0; k<depth; k++) 01235 { 01236 blockB[count+0] = cj(b0[k]); 01237 blockB[count+1] = cj(b1[k]); 01238 if(nr==4) blockB[count+2] = cj(b2[k]); 01239 if(nr==4) blockB[count+3] = cj(b3[k]); 01240 count += nr; 01241 } 01242 // skip what we have after 01243 if(PanelMode) count += nr * (stride-offset-depth); 01244 } 01245 01246 // copy the remaining columns one at a time (nr==1) 01247 for(Index j2=packet_cols; j2<cols; ++j2) 01248 { 01249 if(PanelMode) count += offset; 01250 const Scalar* b0 = &rhs[(j2+0)*rhsStride]; 01251 for(Index k=0; k<depth; k++) 01252 { 01253 blockB[count] = cj(b0[k]); 01254 count += 1; 01255 } 01256 if(PanelMode) count += (stride-offset-depth); 01257 } 01258 } 01259 01260 // this version is optimized for row major matrices 01261 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> 01262 struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> 01263 { 01264 enum { PacketSize = packet_traits<Scalar>::size }; 01265 EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); 01266 }; 01267 01268 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> 01269 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> 01270 ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) 01271 { 01272 EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); 01273 EIGEN_UNUSED_VARIABLE(stride) 01274 EIGEN_UNUSED_VARIABLE(offset) 01275 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); 01276 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; 01277 Index packet_cols = (cols/nr) * nr; 01278 Index count = 0; 01279 for(Index j2=0; j2<packet_cols; j2+=nr) 01280 { 01281 // skip what we have before 01282 if(PanelMode) count += nr * offset; 01283 for(Index k=0; k<depth; k++) 01284 { 01285 const Scalar* b0 = &rhs[k*rhsStride + j2]; 01286 blockB[count+0] = cj(b0[0]); 01287 blockB[count+1] = cj(b0[1]); 01288 if(nr==4) blockB[count+2] = cj(b0[2]); 01289 if(nr==4) blockB[count+3] = cj(b0[3]); 01290 count += nr; 01291 } 01292 // skip what we have after 01293 if(PanelMode) count += nr * (stride-offset-depth); 01294 } 01295 // copy the remaining columns one at a time (nr==1) 01296 for(Index j2=packet_cols; j2<cols; ++j2) 01297 { 01298 if(PanelMode) count += offset; 01299 const Scalar* b0 = &rhs[j2]; 01300 for(Index k=0; k<depth; k++) 01301 { 01302 blockB[count] = cj(b0[k*rhsStride]); 01303 count += 1; 01304 } 01305 if(PanelMode) count += stride-offset-depth; 01306 } 01307 } 01308 01309 } // end namespace internal 01310 01311 /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. 01312 * \sa setCpuCacheSize */ 01313 inline std::ptrdiff_t l1CacheSize () 01314 { 01315 std::ptrdiff_t l1, l2; 01316 internal::manage_caching_sizes(GetAction, &l1, &l2); 01317 return l1; 01318 } 01319 01320 /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. 01321 * \sa setCpuCacheSize */ 01322 inline std::ptrdiff_t l2CacheSize () 01323 { 01324 std::ptrdiff_t l1, l2; 01325 internal::manage_caching_sizes(GetAction, &l1, &l2); 01326 return l2; 01327 } 01328 01329 /** Set the cpu L1 and L2 cache sizes (in bytes). 01330 * These values are use to adjust the size of the blocks 01331 * for the algorithms working per blocks. 01332 * 01333 * \sa computeProductBlockingSizes */ 01334 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) 01335 { 01336 internal::manage_caching_sizes(SetAction, &l1, &l2); 01337 } 01338 01339 } // end namespace Eigen 01340 01341 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
Generated on Tue Jul 12 2022 17:46:53 by 1.7.2