GeneralBlockPanelKernel.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;


/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l2CacheSize==0)
  {
    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
  }

  if(action==SetAction)
  {
    // set the cpu cache sizes (in bytes); the block sizes are derived from these values
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
  * for matrix products and related algorithms. The blocking sizes depend on various
  * parameters:
  * - the L1 and L2 cache sizes,
  * - the register level blocking sizes defined by gebp_traits,
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  EIGEN_UNUSED_VARIABLE(n);
  // Explanations:
  // Let's recall that the product algorithm forms kc x nc horizontal panels B' on the rhs and
  // mc x kc blocks A' on the lhs. A' has to fit into the L2 cache. Moreover, B' is processed
  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
  // stay in the L1 cache.
  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<SizeType>(k, l1/kdiv);
  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}

template<typename LhsScalar, typename RhsScalar, typename SizeType>
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}

#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe ?)

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif

/* Vectorization logic
 *  real*real: unpack rhs to constant packets, ...
 *
 *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *          storing each res packet into two packets (2x2),
 *          at the end combine them: swap the second and addsub them
 *  cf*cf : same but with 2x4 blocks
 *  cplx*real : unpack rhs to constant packets, ...
 *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (must be either 2 or 4)
    nr = NumberOfRegisters/4,

    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
};

template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  /* Unpack the rhs coefficients such that each complex coefficient is spread into
   * two packets containing respectively the real and imaginary coefficient
   * duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
   */
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};

template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef RealScalar  LhsScalar;
  typedef Scalar      RhsScalar;
  typedef Scalar      ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};

/* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
 *  |cplx |real | easy vectorization
 */
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
};

template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
{
  Traits traits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
//   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  Index packet_cols = (cols/nr) * nr;
  const Index peeled_mc = (rows/mr)*mr;
  // FIXME:
  const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
  const Index peeled_kc = (depth/4)*4;

  if(unpackedB==0)
    unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

  // loops on each micro vertical panel of rhs (depth x nr)
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

    // loops on each largest micro horizontal panel of lhs (mr x depth)
    // => we select a mr x nr micro block of res which is entirely
    //    stored into mr/packet_size x nr registers.
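    // In plain scalar terms (illustrative pseudo-code only, not the actual
    // loop structure), each iteration of the loop below computes one mr x nr
    // block of the result:
    //
    //   for k in [0, depth):
    //     for j in [0, nr):
    //       for i in [0, mr):
    //         C(i,j) += A'(i,k) * B'(k,j)
    //
    // with the mr x nr accumulator kept entirely in registers (C0..C7 below;
    // only C0, C1, C4, C5 are used when nr==2) and the k loop unrolled by a
    // factor of 4 (peeled_kc).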
    for(Index i=0; i<peeled_mc; i+=mr)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
      prefetch(&blA[0]);

      // gets res block as register
      AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
      traits.initAcc(C0);
      traits.initAcc(C1);
      if(nr==4) traits.initAcc(C2);
      if(nr==4) traits.initAcc(C3);
      traits.initAcc(C4);
      traits.initAcc(C5);
      if(nr==4) traits.initAcc(C6);
      if(nr==4) traits.initAcc(C7);

      ResScalar* r0 = &res[(j2+0)*resStride + i];
      ResScalar* r1 = r0 + resStride;
      ResScalar* r2 = r1 + resStride;
      ResScalar* r3 = r2 + resStride;

      prefetch(r0+16);
      prefetch(r1+16);
      prefetch(r2+16);
      prefetch(r3+16);

      // performs "inner" product
      // TODO let's check whether the following peeled loop could not be
      //      optimized via optimal prefetching from one loop to the other
      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<peeled_kc; k+=4)
      {
        if(nr==2)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          EIGEN_ASM_COMMENT("mybegin2");
          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[1*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadLhs(&blA[3*LhsProgress], A1);
          traits.loadRhs(&blB[2*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[4*LhsProgress], A0);
          traits.loadLhs(&blA[5*LhsProgress], A1);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[5*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);

          traits.loadLhs(&blA[6*LhsProgress], A0);
          traits.loadLhs(&blA[7*LhsProgress], A1);
          traits.loadRhs(&blB[6*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[7*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);
          EIGEN_ASM_COMMENT("myend");
        }
        else
        {
          EIGEN_ASM_COMMENT("mybegin4");
          LhsPacket A0, A1;
          RhsPacket B_0, B1, B2, B3;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,T0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[6*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[3*LhsProgress], A1);
          traits.loadRhs(&blB[7*RhsProgress], B3);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[8*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[9*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[10*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[4*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[5*LhsProgress], A1);
          traits.loadRhs(&blB[11*RhsProgress], B3);

          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[12*RhsProgress], B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.loadRhs(&blB[13*RhsProgress], B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.loadRhs(&blB[14*RhsProgress], B2);
          traits.madd(A0,B3,C3,T0);
          traits.loadLhs(&blA[6*LhsProgress], A0);
          traits.madd(A1,B3,C7,B3);
          traits.loadLhs(&blA[7*LhsProgress], A1);
          traits.loadRhs(&blB[15*RhsProgress], B3);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.madd(A0,B3,C3,T0);
          traits.madd(A1,B3,C7,B3);
        }

        blB += 4*nr*RhsProgress;
        blA += 4*mr;
      }
      // process remaining peeled loop
      for(Index k=peeled_kc; k<depth; k++)
      {
        if(nr==2)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[1*RhsProgress], B_0);
          traits.madd(A0,B_0,C1,T0);
          traits.madd(A1,B_0,C5,B_0);
        }
        else
        {
          LhsPacket A0, A1;
          RhsPacket B_0, B1, B2, B3;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,T0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.madd(A1,B_0,C4,B_0);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.madd(A0,B1,C1,T0);
          traits.madd(A1,B1,C5,B1);
          traits.madd(A0,B2,C2,T0);
          traits.madd(A1,B2,C6,B2);
          traits.madd(A0,B3,C3,T0);
          traits.madd(A1,B3,C7,B3);
        }

        blB += nr*RhsProgress;
        blA += mr;
      }

      if(nr==4)
      {
        ResPacket R0, R1, R2, R3, R4, R5, R6;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        R2 = ploadu<ResPacket>(r2);
        R3 = ploadu<ResPacket>(r3);
        R4 = ploadu<ResPacket>(r0 + ResPacketSize);
        R5 = ploadu<ResPacket>(r1 + ResPacketSize);
        R6 = ploadu<ResPacket>(r2 + ResPacketSize);
        traits.acc(C0, alphav, R0);
        pstoreu(r0, R0);
        R0 = ploadu<ResPacket>(r3 + ResPacketSize);

        traits.acc(C1, alphav, R1);
        traits.acc(C2, alphav, R2);
        traits.acc(C3, alphav, R3);
        traits.acc(C4, alphav, R4);
        traits.acc(C5, alphav, R5);
        traits.acc(C6, alphav, R6);
        traits.acc(C7, alphav, R0);

        pstoreu(r1, R1);
        pstoreu(r2, R2);
        pstoreu(r3, R3);
        pstoreu(r0 + ResPacketSize, R4);
        pstoreu(r1 + ResPacketSize, R5);
        pstoreu(r2 + ResPacketSize, R6);
        pstoreu(r3 + ResPacketSize, R0);
      }
      else
      {
        ResPacket R0, R1, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        R4 = ploadu<ResPacket>(r0 + ResPacketSize);
        traits.acc(C0, alphav, R0);
        pstoreu(r0, R0);
        R0 = ploadu<ResPacket>(r1 + ResPacketSize);
        traits.acc(C1, alphav, R1);
        traits.acc(C4, alphav, R4);
        traits.acc(C5, alphav, R0);
        pstoreu(r1, R1);
        pstoreu(r0 + ResPacketSize, R4);
        pstoreu(r1 + ResPacketSize, R0);
      }

    }

    if(rows-peeled_mc>=LhsProgress)
    {
      Index i = peeled_mc;
      const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
      prefetch(&blA[0]);

      // gets res block as register
      AccPacket C0, C1, C2, C3;
      traits.initAcc(C0);
      traits.initAcc(C1);
      if(nr==4) traits.initAcc(C2);
      if(nr==4) traits.initAcc(C3);

      // performs "inner" product
      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<peeled_kc; k+=4)
      {
        if(nr==2)
        {
          LhsPacket A0;
          RhsPacket B_0, B1;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[2*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[1*LhsProgress], A0);
          traits.loadRhs(&blB[3*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[6*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadLhs(&blA[3*LhsProgress], A0);
          traits.loadRhs(&blB[7*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
        }
        else
        {
          LhsPacket A0;
          RhsPacket B_0, B1, B2, B3;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);

          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.loadRhs(&blB[3*RhsProgress], B3);
          traits.loadRhs(&blB[4*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[5*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[6*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);
          traits.loadLhs(&blA[1*LhsProgress], A0);
          traits.loadRhs(&blB[7*RhsProgress], B3);
          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[8*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[9*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[10*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);
          traits.loadLhs(&blA[2*LhsProgress], A0);
          traits.loadRhs(&blB[11*RhsProgress], B3);

          traits.madd(A0,B_0,C0,B_0);
          traits.loadRhs(&blB[12*RhsProgress], B_0);
          traits.madd(A0,B1,C1,B1);
          traits.loadRhs(&blB[13*RhsProgress], B1);
          traits.madd(A0,B2,C2,B2);
          traits.loadRhs(&blB[14*RhsProgress], B2);
          traits.madd(A0,B3,C3,B3);

          traits.loadLhs(&blA[3*LhsProgress], A0);
          traits.loadRhs(&blB[15*RhsProgress], B3);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
          traits.madd(A0,B2,C2,B2);
          traits.madd(A0,B3,C3,B3);
        }

        blB += nr*4*RhsProgress;
        blA += 4*LhsProgress;
      }
      // process remaining peeled loop
      for(Index k=peeled_kc; k<depth; k++)
      {
        if(nr==2)
        {
          LhsPacket A0;
          RhsPacket B_0, B1;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
        }
        else
        {
          LhsPacket A0;
          RhsPacket B_0, B1, B2, B3;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.loadRhs(&blB[1*RhsProgress], B1);
          traits.loadRhs(&blB[2*RhsProgress], B2);
          traits.loadRhs(&blB[3*RhsProgress], B3);

          traits.madd(A0,B_0,C0,B_0);
          traits.madd(A0,B1,C1,B1);
          traits.madd(A0,B2,C2,B2);
          traits.madd(A0,B3,C3,B3);
        }

        blB += nr*RhsProgress;
        blA += LhsProgress;
      }

      ResPacket R0, R1, R2, R3;
      ResPacket alphav = pset1<ResPacket>(alpha);

      ResScalar* r0 = &res[(j2+0)*resStride + i];
      ResScalar* r1 = r0 + resStride;
      ResScalar* r2 = r1 + resStride;
      ResScalar* r3 = r2 + resStride;

      R0 = ploadu<ResPacket>(r0);
      R1 = ploadu<ResPacket>(r1);
      if(nr==4) R2 = ploadu<ResPacket>(r2);
      if(nr==4) R3 = ploadu<ResPacket>(r3);

      traits.acc(C0, alphav, R0);
      traits.acc(C1, alphav, R1);
      if(nr==4) traits.acc(C2, alphav, R2);
      if(nr==4) traits.acc(C3, alphav, R3);

      pstoreu(r0, R0);
      pstoreu(r1, R1);
      if(nr==4) pstoreu(r2, R2);
      if(nr==4) pstoreu(r3, R3);
    }
    for(Index i=peeled_mc2; i<rows; i++)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      prefetch(&blA[0]);

      // gets a 1 x nr res block as registers
      ResScalar C0(0), C1(0), C2(0), C3(0);
      // TODO directly use blockB ???
      const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
      for(Index k=0; k<depth; k++)
      {
        if(nr==2)
        {
          LhsScalar A0;
          RhsScalar B_0, B1;

          A0 = blA[k];
          B_0 = blB[0];
          B1 = blB[1];
          MADD(cj,A0,B_0,C0,B_0);
          MADD(cj,A0,B1,C1,B1);
        }
        else
        {
          LhsScalar A0;
          RhsScalar B_0, B1, B2, B3;

          A0 = blA[k];
          B_0 = blB[0];
          B1 = blB[1];
          B2 = blB[2];
          B3 = blB[3];

          MADD(cj,A0,B_0,C0,B_0);
          MADD(cj,A0,B1,C1,B1);
          MADD(cj,A0,B2,C2,B2);
          MADD(cj,A0,B3,C3,B3);
        }

        blB += nr;
      }
      res[(j2+0)*resStride + i] += alpha*C0;
      res[(j2+1)*resStride + i] += alpha*C1;
      if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
      if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
    }
  }
  // process remaining rhs/res columns one at a time
  // => do the same but with nr==1
  for(Index j2=packet_cols; j2<cols; j2++)
  {
    // unpack B
    traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

    for(Index i=0; i<peeled_mc; i+=mr)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
      prefetch(&blA[0]);

      // TODO move the res loads to the stores

      // get res block as registers
      AccPacket C0, C4;
      traits.initAcc(C0);
      traits.initAcc(C4);

      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<depth; k++)
      {
        LhsPacket A0, A1;
        RhsPacket B_0;
        RhsPacket T0;

        traits.loadLhs(&blA[0*LhsProgress], A0);
        traits.loadLhs(&blA[1*LhsProgress], A1);
        traits.loadRhs(&blB[0*RhsProgress], B_0);
        traits.madd(A0,B_0,C0,T0);
        traits.madd(A1,B_0,C4,B_0);

        blB += RhsProgress;
        blA += 2*LhsProgress;
      }
      ResPacket R0, R4;
      ResPacket alphav = pset1<ResPacket>(alpha);

      ResScalar* r0 = &res[(j2+0)*resStride + i];

      R0 = ploadu<ResPacket>(r0);
      R4 = ploadu<ResPacket>(r0+ResPacketSize);

      traits.acc(C0, alphav, R0);
      traits.acc(C4, alphav, R4);

      pstoreu(r0, R0);
      pstoreu(r0+ResPacketSize, R4);
    }
    if(rows-peeled_mc>=LhsProgress)
    {
      Index i = peeled_mc;
      const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
      prefetch(&blA[0]);

      AccPacket C0;
      traits.initAcc(C0);

      const RhsScalar* blB = unpackedB;
      for(Index k=0; k<depth; k++)
      {
        LhsPacket A0;
        RhsPacket B_0;
        traits.loadLhs(blA, A0);
        traits.loadRhs(blB, B_0);
        traits.madd(A0, B_0, C0, B_0);
        blB += RhsProgress;
        blA += LhsProgress;
      }

      ResPacket alphav = pset1<ResPacket>(alpha);
      ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
      traits.acc(C0, alphav, R0);
      pstoreu(&res[(j2+0)*resStride + i], R0);
    }
    for(Index i=peeled_mc2; i<rows; i++)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA];
      prefetch(&blA[0]);

      // gets a 1 x 1 res block as registers
      ResScalar C0(0);
      // FIXME directly use blockB ??
      const RhsScalar* blB = &blockB[j2*strideB+offsetB];
      for(Index k=0; k<depth; k++)
      {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
        MADD(cj, A0, B_0, C0, B_0);
      }
      res[(j2+0)*resStride + i] += alpha*C0;
    }
  }
}


#undef CJMADD

// pack a block of the lhs
// The traversal is as follows (mr==4):
//    0  4  8 12 ...
//    1  5  9 13 ...
//    2  6 10 14 ...
//    3  7 11 15 ...
//
//   16 20 24 28 ...
//   17 21 25 29 ...
//   18 22 26 30 ...
//   19 23 27 31 ...
//
//   32 33 34 35 ...
//   36 37 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
  Index count = 0;
  Index peeled_mc = (rows/Pack1)*Pack1;
  for(Index i=0; i<peeled_mc; i+=Pack1)
  {
    if(PanelMode) count += Pack1 * offset;

    if(StorageOrder==ColMajor)
    {
      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C, D;
        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
      }
    }
    else
    {
      for(Index k=0; k<depth; k++)
      {
        // TODO add a vectorized transpose here
        Index w=0;
        for(; w<Pack1-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(Pack1%4)
          for(;w<Pack1;++w)
            blockA[count++] = cj(lhs(i+w, k));
      }
    }
    if(PanelMode) count += Pack1 * (stride-offset-depth);
  }
  if(rows-peeled_mc>=Pack2)
  {
    if(PanelMode) count += Pack2*offset;
    for(Index k=0; k<depth; k++)
      for(Index w=0; w<Pack2; w++)
        blockA[count++] = cj(lhs(peeled_mc+w, k));
    if(PanelMode) count += Pack2 * (stride-offset-depth);
    peeled_mc += Pack2;
  }
  for(Index i=peeled_mc; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}

// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols = (cols/nr) * nr;
  Index count = 0;
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    // skip what we have before
    if(PanelMode) count += nr * offset;
    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
    for(Index k=0; k<depth; k++)
    {
      blockB[count+0] = cj(b0[k]);
      blockB[count+1] = cj(b1[k]);
      if(nr==4) blockB[count+2] = cj(b2[k]);
      if(nr==4) blockB[count+3] = cj(b3[k]);
      count += nr;
    }
    // skip what we have after
    if(PanelMode) count += nr * (stride-offset-depth);
  }

  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(b0[k]);
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}

// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
  EIGEN_UNUSED_VARIABLE(stride)
  EIGEN_UNUSED_VARIABLE(offset)
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols = (cols/nr) * nr;
  Index count = 0;
  for(Index j2=0; j2<packet_cols; j2+=nr)
  {
    // skip what we have before
    if(PanelMode) count += nr * offset;
    for(Index k=0; k<depth; k++)
    {
      const Scalar* b0 = &rhs[k*rhsStride + j2];
      blockB[count+0] = cj(b0[0]);
      blockB[count+1] = cj(b0[1]);
      if(nr==4) blockB[count+2] = cj(b0[2]);
      if(nr==4) blockB[count+3] = cj(b0[3]);
      count += nr;
    }
    // skip what we have after
    if(PanelMode) count += nr * (stride-offset-depth);
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const Scalar* b0 = &rhs[j2];
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(b0[k*rhsStride]);
      count += 1;
    }
    if(PanelMode) count += stride-offset-depth;
  }
}

} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the algorithms working per blocks.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H
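
The cache-size helpers declared at the end of this header (l1CacheSize, l2CacheSize, setCpuCacheSizes) and internal::computeProductBlockingSizes can be exercised directly. The following is a minimal sketch, assuming the header is reached through <Eigen/Core>; the 32 KB / 2 MB overrides and the 1024-sized product are illustrative values only, not recommendations.

#include <cstdio>
#include <Eigen/Core>

int main()
{
  // Cache sizes Eigen is currently using (either queried from the CPU or
  // the 8 KB / 1 MB fallbacks set up by manage_caching_sizes).
  std::printf("L1: %td bytes, L2: %td bytes\n",
              Eigen::l1CacheSize(), Eigen::l2CacheSize());

  // Override them, e.g. when the runtime query is known to be unreliable.
  Eigen::setCpuCacheSizes(32*1024, 2*1024*1024);

  // Compute blocking sizes for a 1024 x 1024 x 1024 double product;
  // k and m are clamped in place to cache-friendly block sizes (n is left
  // untouched by this version of the function).
  std::ptrdiff_t k = 1024, m = 1024, n = 1024;
  Eigen::internal::computeProductBlockingSizes<double,double>(k, m, n);
  std::printf("kc=%td mc=%td nc=%td\n", k, m, n);
  return 0;
}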