Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository ZIP archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of gr-peach-opencv-project-sd-card
matmul.cpp
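The listing that follows is the OpenCV core source implementing cv::gemm (generalized matrix multiplication) and the cv::transform family. For orientation, here is a minimal usage sketch of the public cv::gemm entry point implemented in this file; the matrices, the alpha/beta coefficients and the GEMM_1_T flag are illustrative assumptions, not values taken from the listing.

// Minimal usage sketch (assumed example, not part of matmul.cpp): calling the
// cv::gemm entry point that this file implements.
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat A = (cv::Mat_<float>(2, 2) << 1, 2,
                                          3, 4);
    cv::Mat B = (cv::Mat_<float>(2, 2) << 5, 6,
                                          7, 8);
    cv::Mat C = cv::Mat::eye(2, 2, CV_32F);
    cv::Mat D;

    // D = 1.0 * A^T * B + 0.5 * C  (GEMM_1_T transposes the first operand)
    cv::gemm(A, B, 1.0, C, 0.5, D, cv::GEMM_1_T);

    std::cout << D << std::endl;
    return 0;
}

Internally, a call like this is dispatched by cv::gemm below either to an OpenCL path (ocl_gemm, or ocl_gemm_amdblas when clAmdBlas is available) or to the blocked CPU implementation built from GEMMSingleMul, GEMMBlockMul and GEMMStore.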
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved. 00016 // Third party copyrights are property of their respective owners. 00017 // 00018 // Redistribution and use in source and binary forms, with or without modification, 00019 // are permitted provided that the following conditions are met: 00020 // 00021 // * Redistribution's of source code must retain the above copyright notice, 00022 // this list of conditions and the following disclaimer. 00023 // 00024 // * Redistribution's in binary form must reproduce the above copyright notice, 00025 // this list of conditions and the following disclaimer in the documentation 00026 // and/or other materials provided with the distribution. 00027 // 00028 // * The name of the copyright holders may not be used to endorse or promote products 00029 // derived from this software without specific prior written permission. 00030 // 00031 // This software is provided by the copyright holders and contributors "as is" and 00032 // any express or implied warranties, including, but not limited to, the implied 00033 // warranties of merchantability and fitness for a particular purpose are disclaimed. 00034 // In no event shall the Intel Corporation or contributors be liable for any direct, 00035 // indirect, incidental, special, exemplary, or consequential damages 00036 // (including, but not limited to, procurement of substitute goods or services; 00037 // loss of use, data, or profits; or business interruption) however caused 00038 // and on any theory of liability, whether in contract, strict liability, 00039 // or tort (including negligence or otherwise) arising in any way out of 00040 // the use of this software, even if advised of the possibility of such damage. 
00041 // 00042 //M*/ 00043 00044 #include "precomp.hpp" 00045 #include "opencl_kernels_core.hpp" 00046 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" 00047 00048 namespace cv 00049 { 00050 00051 /****************************************************************************************\ 00052 * GEMM * 00053 \****************************************************************************************/ 00054 00055 static void 00056 GEMM_CopyBlock( const uchar* src, size_t src_step, 00057 uchar* dst, size_t dst_step, 00058 Size size, size_t pix_size ) 00059 { 00060 int j; 00061 size.width *= (int)(pix_size / sizeof(int)); 00062 00063 for( ; size.height--; src += src_step, dst += dst_step ) 00064 { 00065 j=0; 00066 #if CV_ENABLE_UNROLLED 00067 for( ; j <= size.width - 4; j += 4 ) 00068 { 00069 int t0 = ((const int*)src)[j]; 00070 int t1 = ((const int*)src)[j+1]; 00071 ((int*)dst)[j] = t0; 00072 ((int*)dst)[j+1] = t1; 00073 t0 = ((const int*)src)[j+2]; 00074 t1 = ((const int*)src)[j+3]; 00075 ((int*)dst)[j+2] = t0; 00076 ((int*)dst)[j+3] = t1; 00077 } 00078 #endif 00079 for( ; j < size.width; j++ ) 00080 ((int*)dst)[j] = ((const int*)src)[j]; 00081 } 00082 } 00083 00084 00085 static void 00086 GEMM_TransposeBlock( const uchar* src, size_t src_step, 00087 uchar* dst, size_t dst_step, 00088 Size size, size_t pix_size ) 00089 { 00090 int i, j; 00091 for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size ) 00092 { 00093 const uchar* _src = src; 00094 switch( pix_size ) 00095 { 00096 case sizeof(int): 00097 for( j = 0; j < size.height; j++, _src += src_step ) 00098 ((int*)dst)[j] = ((int*)_src)[0]; 00099 break; 00100 case sizeof(int)*2: 00101 for( j = 0; j < size.height*2; j += 2, _src += src_step ) 00102 { 00103 int t0 = ((int*)_src)[0]; 00104 int t1 = ((int*)_src)[1]; 00105 ((int*)dst)[j] = t0; 00106 ((int*)dst)[j+1] = t1; 00107 } 00108 break; 00109 case sizeof(int)*4: 00110 for( j = 0; j < size.height*4; j += 4, _src += src_step ) 00111 { 00112 int t0 = ((int*)_src)[0]; 00113 int t1 = ((int*)_src)[1]; 00114 ((int*)dst)[j] = t0; 00115 ((int*)dst)[j+1] = t1; 00116 t0 = ((int*)_src)[2]; 00117 t1 = ((int*)_src)[3]; 00118 ((int*)dst)[j+2] = t0; 00119 ((int*)dst)[j+3] = t1; 00120 } 00121 break; 00122 default: 00123 assert(0); 00124 return; 00125 } 00126 } 00127 } 00128 00129 00130 template<typename T, typename WT> static void 00131 GEMMSingleMul( const T* a_data, size_t a_step, 00132 const T* b_data, size_t b_step, 00133 const T* c_data, size_t c_step, 00134 T* d_data, size_t d_step, 00135 Size a_size, Size d_size, 00136 double alpha, double beta, int flags ) 00137 { 00138 int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height; 00139 const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data; 00140 cv::AutoBuffer<T> _a_buf; 00141 T* a_buf = 0; 00142 size_t a_step0, a_step1, c_step0, c_step1, t_step; 00143 00144 a_step /= sizeof(a_data[0]); 00145 b_step /= sizeof(b_data[0]); 00146 c_step /= sizeof(c_data[0]); 00147 d_step /= sizeof(d_data[0]); 00148 a_step0 = a_step; 00149 a_step1 = 1; 00150 00151 if( !c_data ) 00152 c_step0 = c_step1 = 0; 00153 else if( !(flags & GEMM_3_T) ) 00154 c_step0 = c_step, c_step1 = 1; 00155 else 00156 c_step0 = 1, c_step1 = c_step; 00157 00158 if( flags & GEMM_1_T ) 00159 { 00160 CV_SWAP( a_step0, a_step1, t_step ); 00161 n = a_size.height; 00162 if( a_step > 1 && n > 1 ) 00163 { 00164 _a_buf.allocate(n); 00165 a_buf = _a_buf; 00166 } 00167 } 00168 00169 if( n == 1 ) /* external product */ 00170 { 00171 cv::AutoBuffer<T> _b_buf; 00172 T* 
b_buf = 0; 00173 00174 if( a_step > 1 && a_size.height > 1 ) 00175 { 00176 _a_buf.allocate(drows); 00177 a_buf = _a_buf; 00178 for( k = 0; k < drows; k++ ) 00179 a_buf[k] = a_data[a_step*k]; 00180 a_data = a_buf; 00181 } 00182 00183 if( b_step > 1 ) 00184 { 00185 _b_buf.allocate(d_size.width); 00186 b_buf = _b_buf; 00187 for( j = 0; j < d_size.width; j++ ) 00188 b_buf[j] = b_data[j*b_step]; 00189 b_data = b_buf; 00190 } 00191 00192 for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step ) 00193 { 00194 WT al = WT(a_data[i])*alpha; 00195 c_data = _c_data; 00196 for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 ) 00197 { 00198 WT s0 = al*WT(b_data[j]); 00199 WT s1 = al*WT(b_data[j+1]); 00200 if( !c_data ) 00201 { 00202 d_data[j] = T(s0); 00203 d_data[j+1] = T(s1); 00204 } 00205 else 00206 { 00207 d_data[j] = T(s0 + WT(c_data[0])*beta); 00208 d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); 00209 } 00210 } 00211 00212 for( ; j < d_size.width; j++, c_data += c_step1 ) 00213 { 00214 WT s0 = al*WT(b_data[j]); 00215 if( !c_data ) 00216 d_data[j] = T(s0); 00217 else 00218 d_data[j] = T(s0 + WT(c_data[0])*beta); 00219 } 00220 } 00221 } 00222 else if( flags & GEMM_2_T ) /* A * Bt */ 00223 { 00224 for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) 00225 { 00226 a_data = _a_data; 00227 b_data = _b_data; 00228 c_data = _c_data; 00229 00230 if( a_buf ) 00231 { 00232 for( k = 0; k < n; k++ ) 00233 a_buf[k] = a_data[a_step1*k]; 00234 a_data = a_buf; 00235 } 00236 00237 for( j = 0; j < d_size.width; j++, b_data += b_step, 00238 c_data += c_step1 ) 00239 { 00240 WT s0(0), s1(0), s2(0), s3(0); 00241 k = 0; 00242 #if CV_ENABLE_UNROLLED 00243 for( ; k <= n - 4; k += 4 ) 00244 { 00245 s0 += WT(a_data[k])*WT(b_data[k]); 00246 s1 += WT(a_data[k+1])*WT(b_data[k+1]); 00247 s2 += WT(a_data[k+2])*WT(b_data[k+2]); 00248 s3 += WT(a_data[k+3])*WT(b_data[k+3]); 00249 } 00250 #endif 00251 for( ; k < n; k++ ) 00252 s0 += WT(a_data[k])*WT(b_data[k]); 00253 s0 = (s0+s1+s2+s3)*alpha; 00254 00255 if( !c_data ) 00256 d_data[j] = T(s0); 00257 else 00258 d_data[j] = T(s0 + WT(c_data[0])*beta); 00259 } 00260 } 00261 } 00262 else if( d_size.width*sizeof(d_data[0]) <= 1600 ) 00263 { 00264 for( i = 0; i < drows; i++, _a_data += a_step0, 00265 _c_data += c_step0, 00266 d_data += d_step ) 00267 { 00268 a_data = _a_data, c_data = _c_data; 00269 00270 if( a_buf ) 00271 { 00272 for( k = 0; k < n; k++ ) 00273 a_buf[k] = a_data[a_step1*k]; 00274 a_data = a_buf; 00275 } 00276 00277 for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 ) 00278 { 00279 const T* b = _b_data + j; 00280 WT s0(0), s1(0), s2(0), s3(0); 00281 00282 for( k = 0; k < n; k++, b += b_step ) 00283 { 00284 WT a(a_data[k]); 00285 s0 += a * WT(b[0]); s1 += a * WT(b[1]); 00286 s2 += a * WT(b[2]); s3 += a * WT(b[3]); 00287 } 00288 00289 if( !c_data ) 00290 { 00291 d_data[j] = T(s0*alpha); 00292 d_data[j+1] = T(s1*alpha); 00293 d_data[j+2] = T(s2*alpha); 00294 d_data[j+3] = T(s3*alpha); 00295 } 00296 else 00297 { 00298 s0 = s0*alpha; s1 = s1*alpha; 00299 s2 = s2*alpha; s3 = s3*alpha; 00300 d_data[j] = T(s0 + WT(c_data[0])*beta); 00301 d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta); 00302 d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta); 00303 d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta); 00304 } 00305 } 00306 00307 for( ; j < m; j++, c_data += c_step1 ) 00308 { 00309 const T* b = _b_data + j; 00310 WT s0(0); 00311 00312 for( k = 0; k < n; k++, b += b_step ) 00313 s0 += WT(a_data[k]) * WT(b[0]); 00314 00315 s0 = 
s0*alpha; 00316 if( !c_data ) 00317 d_data[j] = T(s0); 00318 else 00319 d_data[j] = T(s0 + WT(c_data[0])*beta); 00320 } 00321 } 00322 } 00323 else 00324 { 00325 cv::AutoBuffer<WT> _d_buf(m); 00326 WT* d_buf = _d_buf; 00327 00328 for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step ) 00329 { 00330 a_data = _a_data; 00331 b_data = _b_data; 00332 c_data = _c_data; 00333 00334 if( a_buf ) 00335 { 00336 for( k = 0; k < n; k++ ) 00337 a_buf[k] = _a_data[a_step1*k]; 00338 a_data = a_buf; 00339 } 00340 00341 for( j = 0; j < m; j++ ) 00342 d_buf[j] = WT(0); 00343 00344 for( k = 0; k < n; k++, b_data += b_step ) 00345 { 00346 WT al(a_data[k]); 00347 j=0; 00348 #if CV_ENABLE_UNROLLED 00349 for(; j <= m - 4; j += 4 ) 00350 { 00351 WT t0 = d_buf[j] + WT(b_data[j])*al; 00352 WT t1 = d_buf[j+1] + WT(b_data[j+1])*al; 00353 d_buf[j] = t0; 00354 d_buf[j+1] = t1; 00355 t0 = d_buf[j+2] + WT(b_data[j+2])*al; 00356 t1 = d_buf[j+3] + WT(b_data[j+3])*al; 00357 d_buf[j+2] = t0; 00358 d_buf[j+3] = t1; 00359 } 00360 #endif 00361 for( ; j < m; j++ ) 00362 d_buf[j] += WT(b_data[j])*al; 00363 } 00364 00365 if( !c_data ) 00366 for( j = 0; j < m; j++ ) 00367 d_data[j] = T(d_buf[j]*alpha); 00368 else 00369 for( j = 0; j < m; j++, c_data += c_step1 ) 00370 { 00371 WT t = d_buf[j]*alpha; 00372 d_data[j] = T(t + WT(c_data[0])*beta); 00373 } 00374 } 00375 } 00376 } 00377 00378 00379 template<typename T, typename WT> static void 00380 GEMMBlockMul( const T* a_data, size_t a_step, 00381 const T* b_data, size_t b_step, 00382 WT* d_data, size_t d_step, 00383 Size a_size, Size d_size, int flags ) 00384 { 00385 int i, j, k, n = a_size.width, m = d_size.width; 00386 const T *_a_data = a_data, *_b_data = b_data; 00387 cv::AutoBuffer<T> _a_buf; 00388 T* a_buf = 0; 00389 size_t a_step0, a_step1, t_step; 00390 int do_acc = flags & 16; 00391 00392 a_step /= sizeof(a_data[0]); 00393 b_step /= sizeof(b_data[0]); 00394 d_step /= sizeof(d_data[0]); 00395 00396 a_step0 = a_step; 00397 a_step1 = 1; 00398 00399 if( flags & GEMM_1_T ) 00400 { 00401 CV_SWAP( a_step0, a_step1, t_step ); 00402 n = a_size.height; 00403 _a_buf.allocate(n); 00404 a_buf = _a_buf; 00405 } 00406 00407 if( flags & GEMM_2_T ) 00408 { 00409 /* second operand is transposed */ 00410 for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) 00411 { 00412 a_data = _a_data; b_data = _b_data; 00413 00414 if( a_buf ) 00415 { 00416 for( k = 0; k < n; k++ ) 00417 a_buf[k] = a_data[a_step1*k]; 00418 a_data = a_buf; 00419 } 00420 00421 for( j = 0; j < d_size.width; j++, b_data += b_step ) 00422 { 00423 WT s0 = do_acc ? 
d_data[j]:WT(0), s1(0); 00424 for( k = 0; k <= n - 2; k += 2 ) 00425 { 00426 s0 += WT(a_data[k])*WT(b_data[k]); 00427 s1 += WT(a_data[k+1])*WT(b_data[k+1]); 00428 } 00429 00430 for( ; k < n; k++ ) 00431 s0 += WT(a_data[k])*WT(b_data[k]); 00432 00433 d_data[j] = s0 + s1; 00434 } 00435 } 00436 } 00437 else 00438 { 00439 for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step ) 00440 { 00441 a_data = _a_data, b_data = _b_data; 00442 00443 if( a_buf ) 00444 { 00445 for( k = 0; k < n; k++ ) 00446 a_buf[k] = a_data[a_step1*k]; 00447 a_data = a_buf; 00448 } 00449 00450 for( j = 0; j <= m - 4; j += 4 ) 00451 { 00452 WT s0, s1, s2, s3; 00453 const T* b = b_data + j; 00454 00455 if( do_acc ) 00456 { 00457 s0 = d_data[j]; s1 = d_data[j+1]; 00458 s2 = d_data[j+2]; s3 = d_data[j+3]; 00459 } 00460 else 00461 s0 = s1 = s2 = s3 = WT(0); 00462 00463 for( k = 0; k < n; k++, b += b_step ) 00464 { 00465 WT a(a_data[k]); 00466 s0 += a * WT(b[0]); s1 += a * WT(b[1]); 00467 s2 += a * WT(b[2]); s3 += a * WT(b[3]); 00468 } 00469 00470 d_data[j] = s0; d_data[j+1] = s1; 00471 d_data[j+2] = s2; d_data[j+3] = s3; 00472 } 00473 00474 for( ; j < m; j++ ) 00475 { 00476 const T* b = b_data + j; 00477 WT s0 = do_acc ? d_data[j] : WT(0); 00478 00479 for( k = 0; k < n; k++, b += b_step ) 00480 s0 += WT(a_data[k]) * WT(b[0]); 00481 00482 d_data[j] = s0; 00483 } 00484 } 00485 } 00486 } 00487 00488 00489 template<typename T, typename WT> static void 00490 GEMMStore( const T* c_data, size_t c_step, 00491 const WT* d_buf, size_t d_buf_step, 00492 T* d_data, size_t d_step, Size d_size, 00493 double alpha, double beta, int flags ) 00494 { 00495 const T* _c_data = c_data; 00496 int j; 00497 size_t c_step0, c_step1; 00498 00499 c_step /= sizeof(c_data[0]); 00500 d_buf_step /= sizeof(d_buf[0]); 00501 d_step /= sizeof(d_data[0]); 00502 00503 if( !c_data ) 00504 c_step0 = c_step1 = 0; 00505 else if( !(flags & GEMM_3_T) ) 00506 c_step0 = c_step, c_step1 = 1; 00507 else 00508 c_step0 = 1, c_step1 = c_step; 00509 00510 for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step ) 00511 { 00512 if( _c_data ) 00513 { 00514 c_data = _c_data; 00515 j=0; 00516 #if CV_ENABLE_UNROLLED 00517 for(; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 ) 00518 { 00519 WT t0 = alpha*d_buf[j]; 00520 WT t1 = alpha*d_buf[j+1]; 00521 t0 += beta*WT(c_data[0]); 00522 t1 += beta*WT(c_data[c_step1]); 00523 d_data[j] = T(t0); 00524 d_data[j+1] = T(t1); 00525 t0 = alpha*d_buf[j+2]; 00526 t1 = alpha*d_buf[j+3]; 00527 t0 += beta*WT(c_data[c_step1*2]); 00528 t1 += beta*WT(c_data[c_step1*3]); 00529 d_data[j+2] = T(t0); 00530 d_data[j+3] = T(t1); 00531 } 00532 #endif 00533 for( ; j < d_size.width; j++, c_data += c_step1 ) 00534 { 00535 WT t0 = alpha*d_buf[j]; 00536 d_data[j] = T(t0 + WT(c_data[0])*beta); 00537 } 00538 } 00539 else 00540 { 00541 j = 0; 00542 #if CV_ENABLE_UNROLLED 00543 for( ; j <= d_size.width - 4; j += 4 ) 00544 { 00545 WT t0 = alpha*d_buf[j]; 00546 WT t1 = alpha*d_buf[j+1]; 00547 d_data[j] = T(t0); 00548 d_data[j+1] = T(t1); 00549 t0 = alpha*d_buf[j+2]; 00550 t1 = alpha*d_buf[j+3]; 00551 d_data[j+2] = T(t0); 00552 d_data[j+3] = T(t1); 00553 } 00554 #endif 00555 for( ; j < d_size.width; j++ ) 00556 d_data[j] = T(alpha*d_buf[j]); 00557 } 00558 } 00559 } 00560 00561 00562 typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1, 00563 const void* src2, size_t step2, const void* src3, size_t step3, 00564 void* dst, size_t dststep, Size srcsize, Size dstsize, 00565 double alpha, double beta, int flags ); 
00566 00567 typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1, 00568 const void* src2, size_t step2, void* dst, size_t dststep, 00569 Size srcsize, Size dstsize, int flags ); 00570 00571 typedef void (*GEMMStoreFunc)( const void* src1, size_t step1, 00572 const void* src2, size_t step2, void* dst, size_t dststep, 00573 Size dstsize, double alpha, double beta, int flags ); 00574 00575 static void GEMMSingleMul_32f( const float* a_data, size_t a_step, 00576 const float* b_data, size_t b_step, 00577 const float* c_data, size_t c_step, 00578 float* d_data, size_t d_step, 00579 Size a_size, Size d_size, 00580 double alpha, double beta, int flags ) 00581 { 00582 GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data, 00583 c_step, d_data, d_step, a_size, d_size, 00584 alpha, beta, flags); 00585 } 00586 00587 static void GEMMSingleMul_64f( const double* a_data, size_t a_step, 00588 const double* b_data, size_t b_step, 00589 const double* c_data, size_t c_step, 00590 double* d_data, size_t d_step, 00591 Size a_size, Size d_size, 00592 double alpha, double beta, int flags ) 00593 { 00594 GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data, 00595 c_step, d_data, d_step, a_size, d_size, 00596 alpha, beta, flags); 00597 } 00598 00599 00600 static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step, 00601 const Complexf* b_data, size_t b_step, 00602 const Complexf* c_data, size_t c_step, 00603 Complexf* d_data, size_t d_step, 00604 Size a_size, Size d_size, 00605 double alpha, double beta, int flags ) 00606 { 00607 GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data, 00608 c_step, d_data, d_step, a_size, d_size, 00609 alpha, beta, flags); 00610 } 00611 00612 static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step, 00613 const Complexd* b_data, size_t b_step, 00614 const Complexd* c_data, size_t c_step, 00615 Complexd* d_data, size_t d_step, 00616 Size a_size, Size d_size, 00617 double alpha, double beta, int flags ) 00618 { 00619 GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data, 00620 c_step, d_data, d_step, a_size, d_size, 00621 alpha, beta, flags); 00622 } 00623 00624 static void GEMMBlockMul_32f( const float* a_data, size_t a_step, 00625 const float* b_data, size_t b_step, 00626 double* d_data, size_t d_step, 00627 Size a_size, Size d_size, int flags ) 00628 { 00629 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 00630 } 00631 00632 00633 static void GEMMBlockMul_64f( const double* a_data, size_t a_step, 00634 const double* b_data, size_t b_step, 00635 double* d_data, size_t d_step, 00636 Size a_size, Size d_size, int flags ) 00637 { 00638 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 00639 } 00640 00641 00642 static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step, 00643 const Complexf* b_data, size_t b_step, 00644 Complexd* d_data, size_t d_step, 00645 Size a_size, Size d_size, int flags ) 00646 { 00647 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 00648 } 00649 00650 00651 static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step, 00652 const Complexd* b_data, size_t b_step, 00653 Complexd* d_data, size_t d_step, 00654 Size a_size, Size d_size, int flags ) 00655 { 00656 GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); 00657 } 00658 00659 00660 static void GEMMStore_32f( const float* c_data, size_t c_step, 00661 
const double* d_buf, size_t d_buf_step, 00662 float* d_data, size_t d_step, Size d_size, 00663 double alpha, double beta, int flags ) 00664 { 00665 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 00666 } 00667 00668 00669 static void GEMMStore_64f( const double* c_data, size_t c_step, 00670 const double* d_buf, size_t d_buf_step, 00671 double* d_data, size_t d_step, Size d_size, 00672 double alpha, double beta, int flags ) 00673 { 00674 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 00675 } 00676 00677 00678 static void GEMMStore_32fc( const Complexf* c_data, size_t c_step, 00679 const Complexd* d_buf, size_t d_buf_step, 00680 Complexf* d_data, size_t d_step, Size d_size, 00681 double alpha, double beta, int flags ) 00682 { 00683 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 00684 } 00685 00686 00687 static void GEMMStore_64fc( const Complexd* c_data, size_t c_step, 00688 const Complexd* d_buf, size_t d_buf_step, 00689 Complexd* d_data, size_t d_step, Size d_size, 00690 double alpha, double beta, int flags ) 00691 { 00692 GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); 00693 } 00694 00695 #ifdef HAVE_CLAMDBLAS 00696 00697 static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, 00698 InputArray matC, double beta, OutputArray matD, int flags ) 00699 { 00700 int type = matA.type(), esz = CV_ELEM_SIZE(type); 00701 bool haveC = matC.kind() != cv::_InputArray::NONE; 00702 Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); 00703 bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; 00704 00705 if (atrans) 00706 sizeA = Size(sizeA.height, sizeA.width); 00707 if (btrans) 00708 sizeB = Size(sizeB.height, sizeB.width); 00709 if (haveC && ctrans) 00710 sizeC = Size(sizeC.height, sizeC.width); 00711 00712 Size sizeD(sizeB.width, sizeA.height); 00713 00714 CV_Assert( matB.type() == type && (!haveC || matC.type() == type) ); 00715 CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); 00716 00717 matD.create(sizeD, type); 00718 if ( matA.offset() % esz != 0 || matA.step() % esz != 0 || 00719 matB.offset() % esz != 0 || matB.step() % esz != 0 || 00720 (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) ) 00721 return false; 00722 00723 UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); 00724 if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D)) 00725 { 00726 return false; 00727 } 00728 if (haveC) 00729 { 00730 UMat C = matC.getUMat(); 00731 if (!ocl::internal::isCLBuffer(C)) 00732 return false; 00733 } 00734 if (haveC) 00735 ctrans ? transpose(matC, D) : matC.copyTo(D); 00736 else 00737 D.setTo(Scalar::all(0)); 00738 00739 int M = sizeD.height, N = sizeD.width, K = sizeA.width; 00740 int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz; 00741 int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz; 00742 00743 cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr(); 00744 clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans; 00745 clAmdBlasTranspose transB = btrans ? 
clAmdBlasTrans : clAmdBlasNoTrans; 00746 clAmdBlasOrder order = clAmdBlasRowMajor; 00747 clAmdBlasStatus status = clAmdBlasSuccess; 00748 00749 if (type == CV_32FC1) 00750 status = clAmdBlasSgemmEx(order, transA, transB, M, N, K, 00751 (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 00752 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 00753 (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 00754 1, &clq, 0, NULL, NULL); 00755 else if (type == CV_64FC1) 00756 status = clAmdBlasDgemmEx(order, transA, transB, M, N, K, 00757 alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 00758 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 00759 beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 00760 1, &clq, 0, NULL, NULL); 00761 else if (type == CV_32FC2) 00762 { 00763 cl_float2 alpha_2 = { { (cl_float)alpha, 0 } }; 00764 cl_float2 beta_2 = { { (cl_float)beta, 0 } }; 00765 status = clAmdBlasCgemmEx(order, transA, transB, M, N, K, 00766 alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 00767 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 00768 beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 00769 1, &clq, 0, NULL, NULL); 00770 } 00771 else if (type == CV_64FC2) 00772 { 00773 cl_double2 alpha_2 = { { alpha, 0 } }; 00774 cl_double2 beta_2 = { { beta, 0 } }; 00775 status = clAmdBlasZgemmEx(order, transA, transB, M, N, K, 00776 alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda, 00777 (const cl_mem)B.handle(ACCESS_READ), offb, ldb, 00778 beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc, 00779 1, &clq, 0, NULL, NULL); 00780 } 00781 else 00782 CV_Error(Error::StsUnsupportedFormat, ""); 00783 00784 return status == clAmdBlasSuccess; 00785 } 00786 00787 #endif 00788 00789 #ifdef HAVE_OPENCL 00790 00791 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, 00792 InputArray matC, double beta, OutputArray matD, int flags ) 00793 { 00794 int depth = matA.depth(), cn = matA.channels(); 00795 int type = CV_MAKETYPE(depth, cn); 00796 00797 CV_Assert( type == matB.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); 00798 00799 const ocl::Device & dev = ocl::Device::getDefault(); 00800 bool doubleSupport = dev.doubleFPConfig() > 0; 00801 00802 if (!doubleSupport && depth == CV_64F) 00803 return false; 00804 00805 bool haveC = matC.kind() != cv::_InputArray::NONE; 00806 Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); 00807 bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; 00808 00809 if (atrans) 00810 sizeA = Size(sizeA.height, sizeA.width); 00811 if (btrans) 00812 sizeB = Size(sizeB.height, sizeB.width); 00813 if (haveC && ctrans) 00814 sizeC = Size(sizeC.height, sizeC.width); 00815 00816 Size sizeD(sizeB.width, sizeA.height); 00817 00818 CV_Assert( !haveC || matC.type() == type ); 00819 CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); 00820 00821 int max_wg_size = (int)dev.maxWorkGroupSize(); 00822 int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; 00823 00824 matD.create(sizeD, type); 00825 00826 UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); 00827 00828 if (atrans) 00829 A = A.t(); 00830 00831 if (btrans) 00832 B = B.t(); 00833 00834 if (haveC) 00835 ctrans ? 
transpose(matC, D) : matC.copyTo(D); 00836 00837 int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; 00838 int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); 00839 00840 String opts = format("-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s", 00841 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), 00842 cn, kercn, block_size, 00843 (sizeA.width % block_size !=0) ? "-D NO_MULT" : "", 00844 haveC ? "-D HAVE_C" : "", 00845 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 00846 00847 ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); 00848 if (k.empty()) 00849 return false; 00850 00851 if (depth == CV_64F) 00852 k.args(ocl::KernelArg::ReadOnlyNoSize(A), 00853 ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), 00854 ocl::KernelArg::ReadWrite(D, cn, kercn), 00855 sizeA.width, alpha, beta); 00856 else 00857 k.args(ocl::KernelArg::ReadOnlyNoSize(A), 00858 ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), 00859 ocl::KernelArg::ReadWrite(D, cn, kercn), 00860 sizeA.width, (float)alpha, (float)beta); 00861 00862 size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; 00863 size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; 00864 return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); 00865 } 00866 #endif 00867 } 00868 00869 void cv::gemm( InputArray matA, InputArray matB, double alpha, 00870 InputArray matC, double beta, OutputArray _matD, int flags ) 00871 { 00872 #ifdef HAVE_CLAMDBLAS 00873 #ifdef HAVE_OPENCL 00874 CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && 00875 matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes 00876 ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags)) 00877 #endif 00878 #endif 00879 00880 #ifdef HAVE_OPENCL 00881 CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2, 00882 ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) 00883 #endif 00884 00885 const int block_lin_size = 128; 00886 const int block_size = block_lin_size * block_lin_size; 00887 00888 static double zero[] = {0,0,0,0}; 00889 static float zerof[] = {0,0,0,0}; 00890 00891 Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0 ? 
matC.getMat() : Mat(); 00892 Size a_size = A.size(), d_size; 00893 int i, len = 0, type = A.type(); 00894 00895 CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); 00896 00897 switch( flags & (GEMM_1_T|GEMM_2_T) ) 00898 { 00899 case 0: 00900 d_size = Size( B.cols, a_size.height ); 00901 len = B.rows; 00902 CV_Assert( a_size.width == len ); 00903 break; 00904 case 1: 00905 d_size = Size( B.cols, a_size.width ); 00906 len = B.rows; 00907 CV_Assert( a_size.height == len ); 00908 break; 00909 case 2: 00910 d_size = Size( B.rows, a_size.height ); 00911 len = B.cols; 00912 CV_Assert( a_size.width == len ); 00913 break; 00914 case 3: 00915 d_size = Size( B.rows, a_size.width ); 00916 len = B.cols; 00917 CV_Assert( a_size.height == len ); 00918 break; 00919 } 00920 00921 if( !C.empty() ) 00922 { 00923 CV_Assert( C.type() == type && 00924 (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) || 00925 ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height))); 00926 } 00927 00928 _matD.create( d_size.height, d_size.width, type ); 00929 Mat D = _matD.getMat(); 00930 if( (flags & GEMM_3_T) != 0 && C.data == D.data ) 00931 { 00932 transpose( C, C ); 00933 flags &= ~GEMM_3_T; 00934 } 00935 00936 if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) ) 00937 { 00938 if( type == CV_32F ) 00939 { 00940 float* d = D.ptr<float>(); 00941 const float *a = A.ptr<float>(), 00942 *b = B.ptr<float>(), 00943 *c = (const float*)C.data; 00944 size_t d_step = D.step/sizeof(d[0]), 00945 a_step = A.step/sizeof(a[0]), 00946 b_step = B.step/sizeof(b[0]), 00947 c_step = C.data ? C.step/sizeof(c[0]) : 0; 00948 00949 if( !c ) 00950 c = zerof; 00951 00952 switch( len ) 00953 { 00954 case 2: 00955 if( len == d_size.width && b != d ) 00956 { 00957 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 00958 { 00959 float t0 = a[0]*b[0] + a[1]*b[b_step]; 00960 float t1 = a[0]*b[1] + a[1]*b[b_step+1]; 00961 d[0] = (float)(t0*alpha + c[0]*beta); 00962 d[1] = (float)(t1*alpha + c[1]*beta); 00963 } 00964 } 00965 else if( a != d ) 00966 { 00967 int c_step0 = 1; 00968 if( c == zerof ) 00969 { 00970 c_step0 = 0; 00971 c_step = 1; 00972 } 00973 00974 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 00975 { 00976 float t0 = a[0]*b[0] + a[1]*b[b_step]; 00977 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; 00978 d[0] = (float)(t0*alpha + c[0]*beta); 00979 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 00980 } 00981 } 00982 else 00983 break; 00984 return; 00985 case 3: 00986 if( len == d_size.width && b != d ) 00987 { 00988 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 00989 { 00990 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 00991 float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; 00992 float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; 00993 d[0] = (float)(t0*alpha + c[0]*beta); 00994 d[1] = (float)(t1*alpha + c[1]*beta); 00995 d[2] = (float)(t2*alpha + c[2]*beta); 00996 } 00997 } 00998 else if( a != d ) 00999 { 01000 int c_step0 = 1; 01001 if( c == zerof ) 01002 { 01003 c_step0 = 0; 01004 c_step = 1; 01005 } 01006 01007 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 01008 { 01009 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 01010 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; 01011 float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + 
a[a_step*2+2]*b[b_step*2]; 01012 01013 d[0] = (float)(t0*alpha + c[0]*beta); 01014 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 01015 d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); 01016 } 01017 } 01018 else 01019 break; 01020 return; 01021 case 4: 01022 if( len == d_size.width && b != d ) 01023 { 01024 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 01025 { 01026 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 01027 float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; 01028 float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; 01029 float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; 01030 d[0] = (float)(t0*alpha + c[0]*beta); 01031 d[1] = (float)(t1*alpha + c[1]*beta); 01032 d[2] = (float)(t2*alpha + c[2]*beta); 01033 d[3] = (float)(t3*alpha + c[3]*beta); 01034 } 01035 } 01036 else if( len <= 16 && a != d ) 01037 { 01038 int c_step0 = 1; 01039 if( c == zerof ) 01040 { 01041 c_step0 = 0; 01042 c_step = 1; 01043 } 01044 01045 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 01046 { 01047 float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 01048 float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + 01049 a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; 01050 float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + 01051 a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; 01052 float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + 01053 a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; 01054 d[0] = (float)(t0*alpha + c[0]*beta); 01055 d[d_step] = (float)(t1*alpha + c[c_step]*beta); 01056 d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta); 01057 d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta); 01058 } 01059 } 01060 else 01061 break; 01062 return; 01063 } 01064 } 01065 01066 if( type == CV_64F ) 01067 { 01068 double* d = D.ptr<double>(); 01069 const double *a = A.ptr<double>(), 01070 *b = B.ptr<double>(), 01071 *c = (const double*)C.data; 01072 size_t d_step = D.step/sizeof(d[0]), 01073 a_step = A.step/sizeof(a[0]), 01074 b_step = B.step/sizeof(b[0]), 01075 c_step = C.data ? 
C.step/sizeof(c[0]) : 0; 01076 if( !c ) 01077 c = zero; 01078 01079 switch( len ) 01080 { 01081 case 2: 01082 if( len == d_size.width && b != d ) 01083 { 01084 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 01085 { 01086 double t0 = a[0]*b[0] + a[1]*b[b_step]; 01087 double t1 = a[0]*b[1] + a[1]*b[b_step+1]; 01088 d[0] = t0*alpha + c[0]*beta; 01089 d[1] = t1*alpha + c[1]*beta; 01090 } 01091 } 01092 else if( a != d ) 01093 { 01094 int c_step0 = 1; 01095 if( c == zero ) 01096 { 01097 c_step0 = 0; 01098 c_step = 1; 01099 } 01100 01101 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 01102 { 01103 double t0 = a[0]*b[0] + a[1]*b[b_step]; 01104 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step]; 01105 d[0] = t0*alpha + c[0]*beta; 01106 d[d_step] = t1*alpha + c[c_step]*beta; 01107 } 01108 } 01109 else 01110 break; 01111 return; 01112 case 3: 01113 if( len == d_size.width && b != d ) 01114 { 01115 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 01116 { 01117 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 01118 double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1]; 01119 double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2]; 01120 d[0] = t0*alpha + c[0]*beta; 01121 d[1] = t1*alpha + c[1]*beta; 01122 d[2] = t2*alpha + c[2]*beta; 01123 } 01124 } 01125 else if( a != d ) 01126 { 01127 int c_step0 = 1; 01128 if( c == zero ) 01129 { 01130 c_step0 = 0; 01131 c_step = 1; 01132 } 01133 01134 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 01135 { 01136 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2]; 01137 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2]; 01138 double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2]; 01139 01140 d[0] = t0*alpha + c[0]*beta; 01141 d[d_step] = t1*alpha + c[c_step]*beta; 01142 d[d_step*2] = t2*alpha + c[c_step*2]*beta; 01143 } 01144 } 01145 else 01146 break; 01147 return; 01148 case 4: 01149 if( len == d_size.width && b != d ) 01150 { 01151 for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step ) 01152 { 01153 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 01154 double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1]; 01155 double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2]; 01156 double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3]; 01157 d[0] = t0*alpha + c[0]*beta; 01158 d[1] = t1*alpha + c[1]*beta; 01159 d[2] = t2*alpha + c[2]*beta; 01160 d[3] = t3*alpha + c[3]*beta; 01161 } 01162 } 01163 else if( d_size.width <= 16 && a != d ) 01164 { 01165 int c_step0 = 1; 01166 if( c == zero ) 01167 { 01168 c_step0 = 0; 01169 c_step = 1; 01170 } 01171 01172 for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 ) 01173 { 01174 double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3]; 01175 double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + 01176 a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3]; 01177 double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + 01178 a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3]; 01179 double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] + 01180 a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3]; 01181 d[0] = t0*alpha + c[0]*beta; 01182 d[d_step] = t1*alpha + c[c_step]*beta; 01183 d[d_step*2] = t2*alpha + c[c_step*2]*beta; 01184 d[d_step*3] = t3*alpha + c[c_step*3]*beta; 01185 } 01186 } 01187 else 01188 break; 
01189 return; 01190 } 01191 } 01192 } 01193 01194 { 01195 size_t b_step = B.step; 01196 GEMMSingleMulFunc singleMulFunc; 01197 GEMMBlockMulFunc blockMulFunc; 01198 GEMMStoreFunc storeFunc; 01199 Mat *matD = &D, tmat; 01200 size_t tmat_size = 0; 01201 const uchar* Cdata = C.data; 01202 size_t Cstep = C.data ? (size_t)C.step : 0; 01203 AutoBuffer<uchar> buf; 01204 01205 if( type == CV_32FC1 ) 01206 { 01207 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f; 01208 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f; 01209 storeFunc = (GEMMStoreFunc)GEMMStore_32f; 01210 } 01211 else if( type == CV_64FC1 ) 01212 { 01213 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f; 01214 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f; 01215 storeFunc = (GEMMStoreFunc)GEMMStore_64f; 01216 } 01217 else if( type == CV_32FC2 ) 01218 { 01219 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc; 01220 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc; 01221 storeFunc = (GEMMStoreFunc)GEMMStore_32fc; 01222 } 01223 else 01224 { 01225 CV_Assert( type == CV_64FC2 ); 01226 singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc; 01227 blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc; 01228 storeFunc = (GEMMStoreFunc)GEMMStore_64fc; 01229 } 01230 01231 if( D.data == A.data || D.data == B.data ) 01232 { 01233 tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type); 01234 // Allocate tmat later, once the size of buf is known 01235 matD = &tmat; 01236 } 01237 01238 if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() ) 01239 { 01240 b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type); 01241 flags |= GEMM_2_T; 01242 } 01243 01244 /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 ) 01245 { 01246 blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p : 01247 type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p : 01248 type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p : 01249 type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0; 01250 } 01251 01252 if( blas_func ) 01253 { 01254 const char* transa = flags & GEMM_1_T ? "t" : "n"; 01255 const char* transb = flags & GEMM_2_T ? "t" : "n"; 01256 int lda, ldb, ldd; 01257 01258 if( C->data.ptr ) 01259 { 01260 if( C->data.ptr != D->data.ptr ) 01261 { 01262 if( !(flags & GEMM_3_T) ) 01263 cvCopy( C, D ); 01264 else 01265 cvTranspose( C, D ); 01266 } 01267 } 01268 01269 if( CV_MAT_DEPTH(type) == CV_32F ) 01270 { 01271 Complex32f _alpha, _beta; 01272 01273 lda = A->step/sizeof(float); 01274 ldb = b_step/sizeof(float); 01275 ldd = D->step/sizeof(float); 01276 _alpha.re = (float)alpha; 01277 _alpha.im = 0; 01278 _beta.re = C->data.ptr ? (float)beta : 0; 01279 _beta.im = 0; 01280 if( CV_MAT_CN(type) == 2 ) 01281 lda /= 2, ldb /= 2, ldd /= 2; 01282 01283 blas_func( transb, transa, &d_size.width, &d_size.height, &len, 01284 &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, 01285 &_beta, D->data.ptr, &ldd ); 01286 } 01287 else 01288 { 01289 CvComplex64f _alpha, _beta; 01290 01291 lda = A->step/sizeof(double); 01292 ldb = b_step/sizeof(double); 01293 ldd = D->step/sizeof(double); 01294 _alpha.re = alpha; 01295 _alpha.im = 0; 01296 _beta.re = C->data.ptr ? 
beta : 0; 01297 _beta.im = 0; 01298 if( CV_MAT_CN(type) == 2 ) 01299 lda /= 2, ldb /= 2, ldd /= 2; 01300 01301 blas_func( transb, transa, &d_size.width, &d_size.height, &len, 01302 &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda, 01303 &_beta, D->data.ptr, &ldd ); 01304 } 01305 } 01306 else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) && 01307 len <= 10000) || len <= 10 || 01308 (d_size.width <= block_lin_size && 01309 d_size.height <= block_lin_size && len <= block_lin_size) ) 01310 { 01311 if( tmat_size > 0 ) { 01312 buf.allocate(tmat_size); 01313 tmat = Mat(d_size.height, d_size.width, type, (uchar*)buf ); 01314 } 01315 singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep, 01316 matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags ); 01317 } 01318 else 01319 { 01320 int is_a_t = flags & GEMM_1_T; 01321 int is_b_t = flags & GEMM_2_T; 01322 int elem_size = CV_ELEM_SIZE(type); 01323 int dk0_1, dk0_2; 01324 size_t a_buf_size = 0, b_buf_size, d_buf_size; 01325 uchar* a_buf = 0; 01326 uchar* b_buf = 0; 01327 uchar* d_buf = 0; 01328 int j, k, di = 0, dj = 0, dk = 0; 01329 int dm0, dn0, dk0; 01330 size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1; 01331 int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0); 01332 01333 if( !is_a_t ) 01334 a_step0 = A.step, a_step1 = elem_size; 01335 else 01336 a_step0 = elem_size, a_step1 = A.step; 01337 01338 if( !is_b_t ) 01339 b_step0 = b_step, b_step1 = elem_size; 01340 else 01341 b_step0 = elem_size, b_step1 = b_step; 01342 01343 if( C.empty() ) 01344 { 01345 c_step0 = c_step1 = 0; 01346 flags &= ~GEMM_3_T; 01347 } 01348 else if( !(flags & GEMM_3_T) ) 01349 c_step0 = C.step, c_step1 = elem_size; 01350 else 01351 c_step0 = elem_size, c_step1 = C.step; 01352 01353 dm0 = std::min( block_lin_size, d_size.height ); 01354 dn0 = std::min( block_lin_size, d_size.width ); 01355 dk0_1 = block_size / dm0; 01356 dk0_2 = block_size / dn0; 01357 dk0 = std::min( dk0_1, dk0_2 ); 01358 dk0 = std::min( dk0, len ); 01359 if( dk0*dm0 > block_size ) 01360 dm0 = block_size / dk0; 01361 if( dk0*dn0 > block_size ) 01362 dn0 = block_size / dk0; 01363 01364 dk0_1 = (dn0+dn0/8+2) & -2; 01365 b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size; 01366 d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size; 01367 01368 if( is_a_t ) 01369 { 01370 a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size; 01371 flags &= ~GEMM_1_T; 01372 } 01373 01374 buf.allocate(d_buf_size + b_buf_size + a_buf_size + tmat_size); 01375 d_buf = (uchar*)buf; 01376 b_buf = d_buf + d_buf_size; 01377 01378 if( is_a_t ) 01379 a_buf = b_buf + b_buf_size; 01380 if( tmat_size > 0 ) 01381 tmat = Mat(d_size.height, d_size.width, type, b_buf + b_buf_size + a_buf_size ); 01382 01383 for( i = 0; i < d_size.height; i += di ) 01384 { 01385 di = dm0; 01386 if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height ) 01387 di = d_size.height - i; 01388 01389 for( j = 0; j < d_size.width; j += dj ) 01390 { 01391 uchar* _d = matD->ptr() + i*matD->step + j*elem_size; 01392 const uchar* _c = Cdata + i*c_step0 + j*c_step1; 01393 size_t _d_step = matD->step; 01394 dj = dn0; 01395 01396 if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width ) 01397 dj = d_size.width - j; 01398 01399 flags &= 15; 01400 if( dk0 < len ) 01401 { 01402 _d = d_buf; 01403 _d_step = dj*work_elem_size; 01404 } 01405 01406 for( k = 0; k < len; k += dk ) 01407 { 01408 const uchar* _a = A.ptr() + i*a_step0 + k*a_step1; 01409 size_t _a_step = A.step; 01410 const 
uchar* _b = B.ptr() + k*b_step0 + j*b_step1; 01411 size_t _b_step = b_step; 01412 Size a_bl_size; 01413 01414 dk = dk0; 01415 if( k + dk >= len || 8*(k + dk) + dk > 8*len ) 01416 dk = len - k; 01417 01418 if( !is_a_t ) 01419 a_bl_size.width = dk, a_bl_size.height = di; 01420 else 01421 a_bl_size.width = di, a_bl_size.height = dk; 01422 01423 if( a_buf && is_a_t ) 01424 { 01425 _a_step = dk*elem_size; 01426 GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size ); 01427 std::swap( a_bl_size.width, a_bl_size.height ); 01428 _a = a_buf; 01429 } 01430 01431 if( dj < d_size.width ) 01432 { 01433 Size b_size; 01434 if( !is_b_t ) 01435 b_size.width = dj, b_size.height = dk; 01436 else 01437 b_size.width = dk, b_size.height = dj; 01438 01439 _b_step = b_size.width*elem_size; 01440 GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size ); 01441 _b = b_buf; 01442 } 01443 01444 if( dk0 < len ) 01445 blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step, 01446 a_bl_size, Size(dj,di), flags ); 01447 else 01448 singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep, 01449 _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags ); 01450 flags |= 16; 01451 } 01452 01453 if( dk0 < len ) 01454 storeFunc( _c, Cstep, _d, _d_step, 01455 matD->ptr(i) + j*elem_size, 01456 matD->step, Size(dj,di), alpha, beta, flags ); 01457 } 01458 } 01459 } 01460 01461 if( matD != &D ) 01462 matD->copyTo(D); 01463 } 01464 } 01465 01466 /****************************************************************************************\ 01467 * Transform * 01468 \****************************************************************************************/ 01469 01470 namespace cv 01471 { 01472 01473 template<typename T, typename WT> static void 01474 transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) 01475 { 01476 int x; 01477 01478 if( scn == 2 && dcn == 2 ) 01479 { 01480 for( x = 0; x < len*2; x += 2 ) 01481 { 01482 WT v0 = src[x], v1 = src[x+1]; 01483 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]); 01484 T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]); 01485 dst[x] = t0; dst[x+1] = t1; 01486 } 01487 } 01488 else if( scn == 3 && dcn == 3 ) 01489 { 01490 for( x = 0; x < len*3; x += 3 ) 01491 { 01492 WT v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 01493 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 01494 T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 01495 T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); 01496 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 01497 } 01498 } 01499 else if( scn == 3 && dcn == 1 ) 01500 { 01501 for( x = 0; x < len; x++, src += 3 ) 01502 dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]); 01503 } 01504 else if( scn == 4 && dcn == 4 ) 01505 { 01506 for( x = 0; x < len*4; x += 4 ) 01507 { 01508 WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3]; 01509 T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]); 01510 T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]); 01511 dst[x] = t0; dst[x+1] = t1; 01512 t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]); 01513 t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]); 01514 dst[x+2] = t0; dst[x+3] = t1; 01515 } 01516 } 01517 else 01518 { 01519 for( x = 0; x < len; x++, src += scn, dst += dcn ) 01520 { 01521 const WT* _m = m; 01522 int j, k; 01523 for( j = 0; j < dcn; j++, _m += scn + 1 ) 01524 { 01525 WT s = _m[scn]; 01526 for( k = 0; k < scn; k++ ) 01527 s += _m[k]*src[k]; 01528 dst[j] 
= saturate_cast<T>(s); 01529 } 01530 } 01531 } 01532 } 01533 01534 #if CV_SSE2 01535 01536 static inline void 01537 load3x3Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3 ) 01538 { 01539 m0 = _mm_setr_ps(m[0], m[4], m[8], 0); 01540 m1 = _mm_setr_ps(m[1], m[5], m[9], 0); 01541 m2 = _mm_setr_ps(m[2], m[6], m[10], 0); 01542 m3 = _mm_setr_ps(m[3], m[7], m[11], 0); 01543 } 01544 01545 static inline void 01546 load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, __m128& m4 ) 01547 { 01548 m0 = _mm_setr_ps(m[0], m[5], m[10], m[15]); 01549 m1 = _mm_setr_ps(m[1], m[6], m[11], m[16]); 01550 m2 = _mm_setr_ps(m[2], m[7], m[12], m[17]); 01551 m3 = _mm_setr_ps(m[3], m[8], m[13], m[18]); 01552 m4 = _mm_setr_ps(m[4], m[9], m[14], m[19]); 01553 } 01554 01555 #endif 01556 01557 static void 01558 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) 01559 { 01560 #if CV_SSE2 01561 const int BITS = 10, SCALE = 1 << BITS; 01562 const float MAX_M = (float)(1 << (15 - BITS)); 01563 01564 if( USE_SSE2 && scn == 3 && dcn == 3 && 01565 std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 && 01566 std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 && 01567 std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 ) 01568 { 01569 // faster fixed-point transformation 01570 short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE), 01571 m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE), 01572 m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE), 01573 m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE), 01574 m22 = saturate_cast<short>(m[10]*SCALE); 01575 int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ), 01576 m23 = saturate_cast<int>((m[11]+0.5f)*SCALE); 01577 01578 __m128i m0 = _mm_setr_epi16(0, m00, m01, m02, m00, m01, m02, 0); 01579 __m128i m1 = _mm_setr_epi16(0, m10, m11, m12, m10, m11, m12, 0); 01580 __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0); 01581 __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0); 01582 int x = 0; 01583 01584 for( ; x <= (len - 8)*3; x += 8*3 ) 01585 { 01586 __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1; 01587 __m128i v0 = _mm_loadl_epi64((const __m128i*)(src + x)); 01588 __m128i v1 = _mm_loadl_epi64((const __m128i*)(src + x + 8)); 01589 __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 16)), v3; 01590 v0 = _mm_unpacklo_epi8(v0, z); // b0 g0 r0 b1 g1 r1 b2 g2 01591 v1 = _mm_unpacklo_epi8(v1, z); // r2 b3 g3 r3 b4 g4 r4 b5 01592 v2 = _mm_unpacklo_epi8(v2, z); // g5 r5 b6 g6 r6 b7 g7 r7 01593 01594 v3 = _mm_srli_si128(v2, 2); // ? b6 g6 r6 b7 g7 r7 0 01595 v2 = _mm_or_si128(_mm_slli_si128(v2, 10), _mm_srli_si128(v1, 6)); // ? b4 g4 r4 b5 g5 r5 ? 01596 v1 = _mm_or_si128(_mm_slli_si128(v1, 6), _mm_srli_si128(v0, 10)); // ? b2 g2 r2 b3 g3 r3 ? 01597 v0 = _mm_slli_si128(v0, 2); // 0 b0 g0 r0 b1 g1 r1 ? 
01598 01599 // process pixels 0 & 1 01600 t0 = _mm_madd_epi16(v0, m0); // a0 b0 a1 b1 01601 t1 = _mm_madd_epi16(v0, m1); // c0 d0 c1 d1 01602 t2 = _mm_madd_epi16(v0, m2); // e0 f0 e1 f1 01603 v0 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 01604 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 01605 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 01606 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 01607 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v0, t1), _mm_unpackhi_epi64(v0,t1)), m3); // B0 G0 R0 0 01608 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B1 G1 R1 0 01609 r0 = _mm_srai_epi32(r0, BITS); 01610 r1 = _mm_srai_epi32(r1, BITS); 01611 v0 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B0 G0 R0 B1 G1 R1 0 01612 01613 // process pixels 2 & 3 01614 t0 = _mm_madd_epi16(v1, m0); // a0 b0 a1 b1 01615 t1 = _mm_madd_epi16(v1, m1); // c0 d0 c1 d1 01616 t2 = _mm_madd_epi16(v1, m2); // e0 f0 e1 f1 01617 v1 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 01618 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 01619 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 01620 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 01621 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v1, t1), _mm_unpackhi_epi64(v1,t1)), m3); // B2 G2 R2 0 01622 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B3 G3 R3 0 01623 r0 = _mm_srai_epi32(r0, BITS); 01624 r1 = _mm_srai_epi32(r1, BITS); 01625 v1 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B2 G2 R2 B3 G3 R3 0 01626 01627 // process pixels 4 & 5 01628 t0 = _mm_madd_epi16(v2, m0); // a0 b0 a1 b1 01629 t1 = _mm_madd_epi16(v2, m1); // c0 d0 c1 d1 01630 t2 = _mm_madd_epi16(v2, m2); // e0 f0 e1 f1 01631 v2 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 01632 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 01633 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 01634 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 01635 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v2, t1), _mm_unpackhi_epi64(v2,t1)), m3); // B4 G4 R4 0 01636 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B5 G5 R5 0 01637 r0 = _mm_srai_epi32(r0, BITS); 01638 r1 = _mm_srai_epi32(r1, BITS); 01639 v2 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B4 G4 R4 B5 G5 R5 0 01640 01641 // process pixels 6 & 7 01642 t0 = _mm_madd_epi16(v3, m0); // a0 b0 a1 b1 01643 t1 = _mm_madd_epi16(v3, m1); // c0 d0 c1 d1 01644 t2 = _mm_madd_epi16(v3, m2); // e0 f0 e1 f1 01645 v3 = _mm_unpacklo_epi32(t0, t1); // a0 c0 b0 d0 01646 t0 = _mm_unpackhi_epi32(t0, t1); // a1 b1 c1 d1 01647 t1 = _mm_unpacklo_epi32(t2, z); // e0 0 f0 0 01648 t2 = _mm_unpackhi_epi32(t2, z); // e1 0 f1 0 01649 r0 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(v3, t1), _mm_unpackhi_epi64(v3,t1)), m3); // B6 G6 R6 0 01650 r1 = _mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi64(t0, t2), _mm_unpackhi_epi64(t0,t2)), m3); // B7 G7 R7 0 01651 r0 = _mm_srai_epi32(r0, BITS); 01652 r1 = _mm_srai_epi32(r1, BITS); 01653 v3 = _mm_packus_epi16(_mm_packs_epi32(_mm_slli_si128(r0, 4), r1), z); // 0 B6 G6 R6 B7 G7 R7 0 01654 01655 v0 = _mm_or_si128(_mm_srli_si128(v0, 1), _mm_slli_si128(v1, 5)); 01656 v1 = _mm_or_si128(_mm_srli_si128(v1, 3), _mm_slli_si128(v2, 3)); 01657 v2 = _mm_or_si128(_mm_srli_si128(v2, 5), _mm_slli_si128(v3, 1)); 01658 _mm_storel_epi64((__m128i*)(dst + x), v0); 01659 _mm_storel_epi64((__m128i*)(dst + x + 8), v1); 01660 _mm_storel_epi64((__m128i*)(dst 
+ x + 16), v2); 01661 } 01662 01663 for( ; x < len*3; x += 3 ) 01664 { 01665 int v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 01666 uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS); 01667 uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS); 01668 uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS); 01669 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 01670 } 01671 return; 01672 } 01673 #endif 01674 01675 transform_(src, dst, m, len, scn, dcn); 01676 } 01677 01678 static void 01679 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) 01680 { 01681 #if CV_SSE2 01682 if( USE_SSE2 && scn == 3 && dcn == 3 ) 01683 { 01684 __m128 m0, m1, m2, m3; 01685 __m128i delta = _mm_setr_epi16(0,-32768,-32768,-32768,-32768,-32768,-32768,0); 01686 load3x3Matrix(m, m0, m1, m2, m3); 01687 m3 = _mm_sub_ps(m3, _mm_setr_ps(32768.f, 32768.f, 32768.f, 0.f)); 01688 01689 int x = 0; 01690 for( ; x <= (len - 4)*3; x += 4*3 ) 01691 { 01692 __m128i z = _mm_setzero_si128(); 01693 __m128i v0 = _mm_loadu_si128((const __m128i*)(src + x)), v1; 01694 __m128i v2 = _mm_loadl_epi64((const __m128i*)(src + x + 8)), v3; 01695 v1 = _mm_unpacklo_epi16(_mm_srli_si128(v0, 6), z); // b1 g1 r1 01696 v3 = _mm_unpacklo_epi16(_mm_srli_si128(v2, 2), z); // b3 g3 r3 01697 v2 = _mm_or_si128(_mm_srli_si128(v0, 12), _mm_slli_si128(v2, 4)); 01698 v0 = _mm_unpacklo_epi16(v0, z); // b0 g0 r0 01699 v2 = _mm_unpacklo_epi16(v2, z); // b2 g2 r2 01700 __m128 x0 = _mm_cvtepi32_ps(v0), x1 = _mm_cvtepi32_ps(v1); 01701 __m128 x2 = _mm_cvtepi32_ps(v2), x3 = _mm_cvtepi32_ps(v3); 01702 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 01703 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 01704 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 01705 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3); 01706 __m128 y1 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 01707 _mm_mul_ps(m0, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(0,0,0,0))), 01708 _mm_mul_ps(m1, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(1,1,1,1)))), 01709 _mm_mul_ps(m2, _mm_shuffle_ps(x1,x1,_MM_SHUFFLE(2,2,2,2)))), m3); 01710 __m128 y2 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 01711 _mm_mul_ps(m0, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(0,0,0,0))), 01712 _mm_mul_ps(m1, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(1,1,1,1)))), 01713 _mm_mul_ps(m2, _mm_shuffle_ps(x2,x2,_MM_SHUFFLE(2,2,2,2)))), m3); 01714 __m128 y3 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 01715 _mm_mul_ps(m0, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(0,0,0,0))), 01716 _mm_mul_ps(m1, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(1,1,1,1)))), 01717 _mm_mul_ps(m2, _mm_shuffle_ps(x3,x3,_MM_SHUFFLE(2,2,2,2)))), m3); 01718 v0 = _mm_cvtps_epi32(y0); v1 = _mm_cvtps_epi32(y1); 01719 v2 = _mm_cvtps_epi32(y2); v3 = _mm_cvtps_epi32(y3); 01720 01721 v0 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v0,4), v1), delta); // 0 b0 g0 r0 b1 g1 r1 0 01722 v2 = _mm_add_epi16(_mm_packs_epi32(_mm_slli_si128(v2,4), v3), delta); // 0 b2 g2 r2 b3 g3 r3 0 01723 v1 = _mm_or_si128(_mm_srli_si128(v0,2), _mm_slli_si128(v2,10)); // b0 g0 r0 b1 g1 r1 b2 g2 01724 v2 = _mm_srli_si128(v2, 6); // r2 b3 g3 r3 0 0 0 0 01725 _mm_storeu_si128((__m128i*)(dst + x), v1); 01726 _mm_storel_epi64((__m128i*)(dst + x + 8), v2); 01727 } 01728 01729 for( ; x < len*3; x += 3 ) 01730 { 01731 float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 01732 ushort t0 = saturate_cast<ushort>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 01733 ushort t1 = saturate_cast<ushort>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 01734 ushort t2 = saturate_cast<ushort>(m[8]*v0 + 
m[9]*v1 + m[10]*v2 + m[11]); 01735 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 01736 } 01737 return; 01738 } 01739 #endif 01740 01741 transform_(src, dst, m, len, scn, dcn); 01742 } 01743 01744 01745 static void 01746 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) 01747 { 01748 #if CV_SSE2 01749 if( USE_SSE2 ) 01750 { 01751 int x = 0; 01752 if( scn == 3 && dcn == 3 ) 01753 { 01754 __m128 m0, m1, m2, m3; 01755 load3x3Matrix(m, m0, m1, m2, m3); 01756 01757 for( ; x < (len - 1)*3; x += 3 ) 01758 { 01759 __m128 x0 = _mm_loadu_ps(src + x); 01760 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps( 01761 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 01762 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 01763 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), m3); 01764 _mm_storel_pi((__m64*)(dst + x), y0); 01765 _mm_store_ss(dst + x + 2, _mm_movehl_ps(y0,y0)); 01766 } 01767 01768 for( ; x < len*3; x += 3 ) 01769 { 01770 float v0 = src[x], v1 = src[x+1], v2 = src[x+2]; 01771 float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]); 01772 float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]); 01773 float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]); 01774 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 01775 } 01776 return; 01777 } 01778 01779 if( scn == 4 && dcn == 4 ) 01780 { 01781 __m128 m0, m1, m2, m3, m4; 01782 load4x4Matrix(m, m0, m1, m2, m3, m4); 01783 01784 for( ; x < len*4; x += 4 ) 01785 { 01786 __m128 x0 = _mm_loadu_ps(src + x); 01787 __m128 y0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps( 01788 _mm_mul_ps(m0, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(0,0,0,0))), 01789 _mm_mul_ps(m1, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(1,1,1,1)))), 01790 _mm_mul_ps(m2, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(2,2,2,2)))), 01791 _mm_mul_ps(m3, _mm_shuffle_ps(x0,x0,_MM_SHUFFLE(3,3,3,3)))), m4); 01792 _mm_storeu_ps(dst + x, y0); 01793 } 01794 return; 01795 } 01796 } 01797 #endif 01798 01799 transform_(src, dst, m, len, scn, dcn); 01800 } 01801 01802 01803 static void 01804 transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) 01805 { 01806 transform_(src, dst, m, len, scn, dcn); 01807 } 01808 01809 static void 01810 transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) 01811 { 01812 transform_(src, dst, m, len, scn, dcn); 01813 } 01814 01815 static void 01816 transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) 01817 { 01818 transform_(src, dst, m, len, scn, dcn); 01819 } 01820 01821 static void 01822 transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 01823 { 01824 transform_(src, dst, m, len, scn, dcn); 01825 } 01826 01827 template<typename T, typename WT> static void 01828 diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int ) 01829 { 01830 int x; 01831 01832 if( cn == 2 ) 01833 { 01834 for( x = 0; x < len*2; x += 2 ) 01835 { 01836 T t0 = saturate_cast<T>(m[0]*src[x] + m[2]); 01837 T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]); 01838 dst[x] = t0; dst[x+1] = t1; 01839 } 01840 } 01841 else if( cn == 3 ) 01842 { 01843 for( x = 0; x < len*3; x += 3 ) 01844 { 01845 T t0 = saturate_cast<T>(m[0]*src[x] + m[3]); 01846 T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]); 01847 T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]); 01848 dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; 01849 } 01850 } 01851 else if( cn == 4 ) 01852 { 01853 for( x = 0; x < len*4; x += 4 ) 01854 { 01855 T t0 = 
saturate_cast<T>(m[0]*src[x] + m[4]); 01856 T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]); 01857 dst[x] = t0; dst[x+1] = t1; 01858 t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]); 01859 t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]); 01860 dst[x+2] = t0; dst[x+3] = t1; 01861 } 01862 } 01863 else 01864 { 01865 for( x = 0; x < len; x++, src += cn, dst += cn ) 01866 { 01867 const WT* _m = m; 01868 for( int j = 0; j < cn; j++, _m += cn + 1 ) 01869 dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]); 01870 } 01871 } 01872 } 01873 01874 static void 01875 diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn) 01876 { 01877 diagtransform_(src, dst, m, len, scn, dcn); 01878 } 01879 01880 static void 01881 diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) 01882 { 01883 diagtransform_(src, dst, m, len, scn, dcn); 01884 } 01885 01886 static void 01887 diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn) 01888 { 01889 diagtransform_(src, dst, m, len, scn, dcn); 01890 } 01891 01892 static void 01893 diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) 01894 { 01895 diagtransform_(src, dst, m, len, scn, dcn); 01896 } 01897 01898 static void 01899 diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn) 01900 { 01901 diagtransform_(src, dst, m, len, scn, dcn); 01902 } 01903 01904 static void 01905 diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn) 01906 { 01907 diagtransform_(src, dst, m, len, scn, dcn); 01908 } 01909 01910 static void 01911 diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 01912 { 01913 diagtransform_(src, dst, m, len, scn, dcn); 01914 } 01915 01916 01917 typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int ); 01918 01919 static TransformFunc getTransformFunc(int depth) 01920 { 01921 static TransformFunc transformTab[] = 01922 { 01923 (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u, 01924 (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f, 01925 (TransformFunc)transform_64f, 0 01926 }; 01927 01928 return transformTab[depth]; 01929 } 01930 01931 static TransformFunc getDiagTransformFunc(int depth) 01932 { 01933 static TransformFunc diagTransformTab[] = 01934 { 01935 (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u, 01936 (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f, 01937 (TransformFunc)diagtransform_64f, 0 01938 }; 01939 01940 return diagTransformTab[depth]; 01941 } 01942 01943 } 01944 01945 void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx ) 01946 { 01947 Mat src = _src.getMat(), m = _mtx.getMat(); 01948 int depth = src.depth(), scn = src.channels(), dcn = m.rows; 01949 CV_Assert( scn == m.cols || scn + 1 == m.cols ); 01950 bool isDiag = false; 01951 01952 _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); 01953 Mat dst = _dst.getMat(); 01954 01955 int mtype = depth == CV_32S || depth == CV_64F ? 
CV_64F : CV_32F; 01956 AutoBuffer<double> _mbuf; 01957 double* mbuf; 01958 01959 if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 ) 01960 { 01961 _mbuf.allocate(dcn*(scn+1)); 01962 mbuf = (double*)_mbuf; 01963 Mat tmp(dcn, scn+1, mtype, mbuf); 01964 memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize()); 01965 if( m.cols == scn+1 ) 01966 m.convertTo(tmp, mtype); 01967 else 01968 { 01969 Mat tmppart = tmp.colRange(0, m.cols); 01970 m.convertTo(tmppart, mtype); 01971 } 01972 m = tmp; 01973 } 01974 else 01975 mbuf = m.ptr<double>(); 01976 01977 if( scn == dcn ) 01978 { 01979 int i, j; 01980 double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON; 01981 01982 if( scn == 1 ) 01983 { 01984 double alpha, beta; 01985 if( mtype == CV_32F ) 01986 alpha = m.at<float>(0), beta = m.at<float>(1); 01987 else 01988 alpha = m.at<double>(0), beta = m.at<double>(1); 01989 src.convertTo(dst, dst.type(), alpha, beta); 01990 return; 01991 } 01992 01993 for( i = 0, isDiag = true; isDiag && i < scn; i++ ) 01994 { 01995 for( j = 0; isDiag && j < scn; j++ ) 01996 { 01997 double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j); 01998 if( i != j && fabs(v) > eps ) 01999 isDiag = false; 02000 } 02001 } 02002 } 02003 02004 TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth); 02005 CV_Assert( func != 0 ); 02006 02007 const Mat* arrays[] = {&src, &dst, 0}; 02008 uchar* ptrs[2]; 02009 NAryMatIterator it(arrays, ptrs); 02010 size_t i, total = it.size; 02011 02012 for( i = 0; i < it.nplanes; i++, ++it ) 02013 func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); 02014 } 02015 02016 /****************************************************************************************\ 02017 * Perspective Transform * 02018 \****************************************************************************************/ 02019 02020 namespace cv 02021 { 02022 02023 template<typename T> static void 02024 perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn ) 02025 { 02026 const double eps = FLT_EPSILON; 02027 int i; 02028 02029 if( scn == 2 && dcn == 2 ) 02030 { 02031 for( i = 0; i < len*2; i += 2 ) 02032 { 02033 T x = src[i], y = src[i + 1]; 02034 double w = x*m[6] + y*m[7] + m[8]; 02035 02036 if( fabs(w) > eps ) 02037 { 02038 w = 1./w; 02039 dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w); 02040 dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w); 02041 } 02042 else 02043 dst[i] = dst[i+1] = (T)0; 02044 } 02045 } 02046 else if( scn == 3 && dcn == 3 ) 02047 { 02048 for( i = 0; i < len*3; i += 3 ) 02049 { 02050 T x = src[i], y = src[i + 1], z = src[i + 2]; 02051 double w = x*m[12] + y*m[13] + z*m[14] + m[15]; 02052 02053 if( fabs(w) > eps ) 02054 { 02055 w = 1./w; 02056 dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w); 02057 dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w); 02058 dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w); 02059 } 02060 else 02061 dst[i] = dst[i+1] = dst[i+2] = (T)0; 02062 } 02063 } 02064 else if( scn == 3 && dcn == 2 ) 02065 { 02066 for( i = 0; i < len; i++, src += 3, dst += 2 ) 02067 { 02068 T x = src[0], y = src[1], z = src[2]; 02069 double w = x*m[8] + y*m[9] + z*m[10] + m[11]; 02070 02071 if( fabs(w) > eps ) 02072 { 02073 w = 1./w; 02074 dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w); 02075 dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w); 02076 } 02077 else 02078 dst[0] = dst[1] = (T)0; 02079 } 02080 } 02081 else 02082 { 02083 for( i = 0; i < len; i++, src += scn, dst += dcn ) 02084 { 02085 const double* _m = m + dcn*(scn 
+ 1); 02086 double w = _m[scn]; 02087 int j, k; 02088 for( k = 0; k < scn; k++ ) 02089 w += _m[k]*src[k]; 02090 if( fabs(w) > eps ) 02091 { 02092 _m = m; 02093 for( j = 0; j < dcn; j++, _m += scn + 1 ) 02094 { 02095 double s = _m[scn]; 02096 for( k = 0; k < scn; k++ ) 02097 s += _m[k]*src[k]; 02098 dst[j] = (T)(s*w); 02099 } 02100 } 02101 else 02102 for( j = 0; j < dcn; j++ ) 02103 dst[j] = 0; 02104 } 02105 } 02106 } 02107 02108 02109 static void 02110 perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn) 02111 { 02112 perspectiveTransform_(src, dst, m, len, scn, dcn); 02113 } 02114 02115 static void 02116 perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) 02117 { 02118 perspectiveTransform_(src, dst, m, len, scn, dcn); 02119 } 02120 02121 } 02122 02123 void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx ) 02124 { 02125 Mat src = _src.getMat(), m = _mtx.getMat(); 02126 int depth = src.depth(), scn = src.channels(), dcn = m.rows-1; 02127 CV_Assert( scn + 1 == m.cols ); 02128 CV_Assert( depth == CV_32F || depth == CV_64F ); 02129 02130 _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); 02131 Mat dst = _dst.getMat(); 02132 02133 const int mtype = CV_64F; 02134 AutoBuffer<double> _mbuf; 02135 double* mbuf = _mbuf; 02136 02137 if( !m.isContinuous() || m.type() != mtype ) 02138 { 02139 _mbuf.allocate((dcn+1)*(scn+1)); 02140 Mat tmp(dcn+1, scn+1, mtype, (double*)_mbuf); 02141 m.convertTo(tmp, mtype); 02142 m = tmp; 02143 } 02144 else 02145 mbuf = m.ptr<double>(); 02146 02147 TransformFunc func = depth == CV_32F ? 02148 (TransformFunc)perspectiveTransform_32f : 02149 (TransformFunc)perspectiveTransform_64f; 02150 CV_Assert( func != 0 ); 02151 02152 const Mat* arrays[] = {&src, &dst, 0}; 02153 uchar* ptrs[2]; 02154 NAryMatIterator it(arrays, ptrs); 02155 size_t i, total = it.size; 02156 02157 for( i = 0; i < it.nplanes; i++, ++it ) 02158 func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); 02159 } 02160 02161 /****************************************************************************************\ 02162 * ScaleAdd * 02163 \****************************************************************************************/ 02164 02165 namespace cv 02166 { 02167 02168 static void scaleAdd_32f(const float* src1, const float* src2, float* dst, 02169 int len, float* _alpha) 02170 { 02171 float alpha = *_alpha; 02172 int i = 0; 02173 #if CV_SSE2 02174 if( USE_SSE2 ) 02175 { 02176 __m128 a4 = _mm_set1_ps(alpha); 02177 if( (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 ) 02178 for( ; i <= len - 8; i += 8 ) 02179 { 02180 __m128 x0, x1, y0, y1, t0, t1; 02181 x0 = _mm_load_ps(src1 + i); x1 = _mm_load_ps(src1 + i + 4); 02182 y0 = _mm_load_ps(src2 + i); y1 = _mm_load_ps(src2 + i + 4); 02183 t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0); 02184 t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1); 02185 _mm_store_ps(dst + i, t0); 02186 _mm_store_ps(dst + i + 4, t1); 02187 } 02188 else 02189 for( ; i <= len - 8; i += 8 ) 02190 { 02191 __m128 x0, x1, y0, y1, t0, t1; 02192 x0 = _mm_loadu_ps(src1 + i); x1 = _mm_loadu_ps(src1 + i + 4); 02193 y0 = _mm_loadu_ps(src2 + i); y1 = _mm_loadu_ps(src2 + i + 4); 02194 t0 = _mm_add_ps(_mm_mul_ps(x0, a4), y0); 02195 t1 = _mm_add_ps(_mm_mul_ps(x1, a4), y1); 02196 _mm_storeu_ps(dst + i, t0); 02197 _mm_storeu_ps(dst + i + 4, t1); 02198 } 02199 } 02200 else 02201 #elif CV_NEON 02202 if (true) 02203 { 02204 for ( ; i <= len - 4; i += 4) 02205 { 02206 float32x4_t v_src1 = 
vld1q_f32(src1 + i), v_src2 = vld1q_f32(src2 + i); 02207 vst1q_f32(dst + i, vaddq_f32(vmulq_n_f32(v_src1, alpha), v_src2)); 02208 } 02209 } 02210 else 02211 #endif 02212 //vz why do we need unroll here? 02213 for( ; i <= len - 4; i += 4 ) 02214 { 02215 float t0, t1; 02216 t0 = src1[i]*alpha + src2[i]; 02217 t1 = src1[i+1]*alpha + src2[i+1]; 02218 dst[i] = t0; dst[i+1] = t1; 02219 t0 = src1[i+2]*alpha + src2[i+2]; 02220 t1 = src1[i+3]*alpha + src2[i+3]; 02221 dst[i+2] = t0; dst[i+3] = t1; 02222 } 02223 for(; i < len; i++ ) 02224 dst[i] = src1[i]*alpha + src2[i]; 02225 } 02226 02227 02228 static void scaleAdd_64f(const double* src1, const double* src2, double* dst, 02229 int len, double* _alpha) 02230 { 02231 double alpha = *_alpha; 02232 int i = 0; 02233 #if CV_SSE2 02234 if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst) & 15) == 0 ) 02235 { 02236 __m128d a2 = _mm_set1_pd(alpha); 02237 for( ; i <= len - 4; i += 4 ) 02238 { 02239 __m128d x0, x1, y0, y1, t0, t1; 02240 x0 = _mm_load_pd(src1 + i); x1 = _mm_load_pd(src1 + i + 2); 02241 y0 = _mm_load_pd(src2 + i); y1 = _mm_load_pd(src2 + i + 2); 02242 t0 = _mm_add_pd(_mm_mul_pd(x0, a2), y0); 02243 t1 = _mm_add_pd(_mm_mul_pd(x1, a2), y1); 02244 _mm_store_pd(dst + i, t0); 02245 _mm_store_pd(dst + i + 2, t1); 02246 } 02247 } 02248 else 02249 #endif 02250 //vz why do we need unroll here? 02251 for( ; i <= len - 4; i += 4 ) 02252 { 02253 double t0, t1; 02254 t0 = src1[i]*alpha + src2[i]; 02255 t1 = src1[i+1]*alpha + src2[i+1]; 02256 dst[i] = t0; dst[i+1] = t1; 02257 t0 = src1[i+2]*alpha + src2[i+2]; 02258 t1 = src1[i+3]*alpha + src2[i+3]; 02259 dst[i+2] = t0; dst[i+3] = t1; 02260 } 02261 for(; i < len; i++ ) 02262 dst[i] = src1[i]*alpha + src2[i]; 02263 } 02264 02265 typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha); 02266 02267 #ifdef HAVE_OPENCL 02268 02269 static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) 02270 { 02271 const ocl::Device & d = ocl::Device::getDefault(); 02272 02273 bool doubleSupport = d.doubleFPConfig() > 0; 02274 Size size = _src1.size(); 02275 int depth = CV_MAT_DEPTH(type); 02276 if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() ) 02277 return false; 02278 02279 _dst.create(size, type); 02280 int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F); 02281 int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst), 02282 rowsPerWI = d.isIntel() ? 4 : 1; 02283 02284 char cvt[2][50]; 02285 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, 02286 format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" 02287 " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s" 02288 " -D wdepth=%d%s -D rowsPerWI=%d", 02289 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), 02290 ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), 02291 ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), 02292 ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]), 02293 ocl::typeToStr(wdepth), wdepth, 02294 doubleSupport ?
" -D DOUBLE_SUPPORT" : "", rowsPerWI)); 02295 if (k.empty()) 02296 return false; 02297 02298 UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(); 02299 02300 ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), 02301 src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), 02302 dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); 02303 02304 if (wdepth == CV_32F) 02305 k.args(src1arg, src2arg, dstarg, (float)alpha); 02306 else 02307 k.args(src1arg, src2arg, dstarg, alpha); 02308 02309 size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI }; 02310 return k.run(2, globalsize, NULL, false); 02311 } 02312 02313 #endif 02314 02315 } 02316 02317 void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst ) 02318 { 02319 int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 02320 CV_Assert( type == _src2.type() ); 02321 02322 #ifdef HAVE_OPENCL 02323 CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(), 02324 ocl_scaleAdd(_src1, alpha, _src2, _dst, type)) 02325 #endif 02326 02327 if( depth < CV_32F ) 02328 { 02329 addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth); 02330 return; 02331 } 02332 02333 Mat src1 = _src1.getMat(), src2 = _src2.getMat(); 02334 CV_Assert(src1.size == src2.size); 02335 02336 _dst.create(src1.dims, src1.size, type); 02337 Mat dst = _dst.getMat(); 02338 02339 float falpha = (float)alpha; 02340 void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha; 02341 02342 ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f; 02343 02344 if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()) 02345 { 02346 size_t len = src1.total()*cn; 02347 func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha); 02348 return; 02349 } 02350 02351 const Mat* arrays[] = {&src1, &src2, &dst, 0}; 02352 uchar* ptrs[3]; 02353 NAryMatIterator it(arrays, ptrs); 02354 size_t i, len = it.size*cn; 02355 02356 for( i = 0; i < it.nplanes; i++, ++it ) 02357 func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha ); 02358 } 02359 02360 /****************************************************************************************\ 02361 * Covariance Matrix * 02362 \****************************************************************************************/ 02363 02364 void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype ) 02365 { 02366 CV_Assert( data && nsamples > 0 ); 02367 Size size = data[0].size(); 02368 int sz = size.width * size.height, esz = (int)data[0].elemSize(); 02369 int type = data[0].type(); 02370 Mat mean; 02371 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ?
ctype : type), _mean.depth()), CV_32F); 02372 02373 if( (flags & CV_COVAR_USE_AVG) != 0 ) 02374 { 02375 CV_Assert( _mean.size() == size ); 02376 if( _mean.isContinuous() && _mean.type() == ctype ) 02377 mean = _mean.reshape(1, 1); 02378 else 02379 { 02380 _mean.convertTo(mean, ctype); 02381 mean = mean.reshape(1, 1); 02382 } 02383 } 02384 02385 Mat _data(nsamples, sz, type); 02386 02387 for( int i = 0; i < nsamples; i++ ) 02388 { 02389 CV_Assert( data[i].size() == size && data[i].type() == type ); 02390 if( data[i].isContinuous() ) 02391 memcpy( _data.ptr(i), data[i].ptr(), sz*esz ); 02392 else 02393 { 02394 Mat dataRow(size.height, size.width, type, _data.ptr(i)); 02395 data[i].copyTo(dataRow); 02396 } 02397 } 02398 02399 calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); 02400 if( (flags & CV_COVAR_USE_AVG) == 0 ) 02401 _mean = mean.reshape(1, size.height); 02402 } 02403 02404 void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype ) 02405 { 02406 if(_src.kind() == _InputArray::STD_VECTOR_MAT) 02407 { 02408 std::vector<cv::Mat> src; 02409 _src.getMatVector(src); 02410 02411 CV_Assert( src.size() > 0 ); 02412 02413 Size size = src[0].size(); 02414 int type = src[0].type(); 02415 02416 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F); 02417 02418 Mat _data(static_cast<int>(src.size()), size.area(), type); 02419 02420 int i = 0; 02421 for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); each++, i++ ) 02422 { 02423 CV_Assert( (*each).size() == size && (*each).type() == type ); 02424 Mat dataRow(size.height, size.width, type, _data.ptr(i)); 02425 (*each).copyTo(dataRow); 02426 } 02427 02428 Mat mean; 02429 if( (flags & CV_COVAR_USE_AVG) != 0 ) 02430 { 02431 CV_Assert( _mean.size() == size ); 02432 02433 if( mean.type() != ctype ) 02434 { 02435 mean = _mean.getMat(); 02436 _mean.create(mean.size(), ctype); 02437 Mat tmp = _mean.getMat(); 02438 mean.convertTo(tmp, ctype); 02439 mean = tmp; 02440 } 02441 02442 mean = _mean.getMat().reshape(1, 1); 02443 } 02444 02445 calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype ); 02446 02447 if( (flags & CV_COVAR_USE_AVG) == 0 ) 02448 { 02449 mean = mean.reshape(1, size.height); 02450 mean.copyTo(_mean); 02451 } 02452 return; 02453 } 02454 02455 Mat data = _src.getMat(), mean; 02456 CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) ); 02457 bool takeRows = (flags & CV_COVAR_ROWS) != 0; 02458 int type = data.type(); 02459 int nsamples = takeRows ? data.rows : data.cols; 02460 CV_Assert( nsamples > 0 ); 02461 Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows); 02462 02463 if( (flags & CV_COVAR_USE_AVG) != 0 ) 02464 { 02465 mean = _mean.getMat(); 02466 ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F); 02467 CV_Assert( mean.size() == size ); 02468 if( mean.type() != ctype ) 02469 { 02470 _mean.create(mean.size(), ctype); 02471 Mat tmp = _mean.getMat(); 02472 mean.convertTo(tmp, ctype); 02473 mean = tmp; 02474 } 02475 } 02476 else 02477 { 02478 ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F); 02479 reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype ); 02480 mean = _mean.getMat(); 02481 } 02482 02483 mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows, 02484 mean, (flags & CV_COVAR_SCALE) != 0 ? 
1./nsamples : 1, ctype ); 02485 } 02486 02487 /****************************************************************************************\ 02488 * Mahalanobis * 02489 \****************************************************************************************/ 02490 02491 double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar ) 02492 { 02493 Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat(); 02494 int type = v1.type(), depth = v1.depth(); 02495 Size sz = v1.size(); 02496 int i, j, len = sz.width*sz.height*v1.channels(); 02497 AutoBuffer<double> buf(len); 02498 double result = 0; 02499 02500 CV_Assert( type == v2.type() && type == icovar.type() && 02501 sz == v2.size() && len == icovar.rows && len == icovar.cols ); 02502 02503 sz.width *= v1.channels(); 02504 if( v1.isContinuous() && v2.isContinuous() ) 02505 { 02506 sz.width *= sz.height; 02507 sz.height = 1; 02508 } 02509 02510 if( depth == CV_32F ) 02511 { 02512 const float* src1 = v1.ptr<float>(); 02513 const float* src2 = v2.ptr<float>(); 02514 size_t step1 = v1.step/sizeof(src1[0]); 02515 size_t step2 = v2.step/sizeof(src2[0]); 02516 double* diff = buf; 02517 const float* mat = icovar.ptr<float>(); 02518 size_t matstep = icovar.step/sizeof(mat[0]); 02519 02520 for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) 02521 { 02522 for( i = 0; i < sz.width; i++ ) 02523 diff[i] = src1[i] - src2[i]; 02524 } 02525 02526 diff = buf; 02527 for( i = 0; i < len; i++, mat += matstep ) 02528 { 02529 double row_sum = 0; 02530 j = 0; 02531 #if CV_ENABLE_UNROLLED 02532 for(; j <= len - 4; j += 4 ) 02533 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + 02534 diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; 02535 #endif 02536 for( ; j < len; j++ ) 02537 row_sum += diff[j]*mat[j]; 02538 result += row_sum * diff[i]; 02539 } 02540 } 02541 else if( depth == CV_64F ) 02542 { 02543 const double* src1 = v1.ptr<double>(); 02544 const double* src2 = v2.ptr<double>(); 02545 size_t step1 = v1.step/sizeof(src1[0]); 02546 size_t step2 = v2.step/sizeof(src2[0]); 02547 double* diff = buf; 02548 const double* mat = icovar.ptr<double>(); 02549 size_t matstep = icovar.step/sizeof(mat[0]); 02550 02551 for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width ) 02552 { 02553 for( i = 0; i < sz.width; i++ ) 02554 diff[i] = src1[i] - src2[i]; 02555 } 02556 02557 diff = buf; 02558 for( i = 0; i < len; i++, mat += matstep ) 02559 { 02560 double row_sum = 0; 02561 j = 0; 02562 #if CV_ENABLE_UNROLLED 02563 for(; j <= len - 4; j += 4 ) 02564 row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] + 02565 diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3]; 02566 #endif 02567 for( ; j < len; j++ ) 02568 row_sum += diff[j]*mat[j]; 02569 result += row_sum * diff[i]; 02570 } 02571 } 02572 else 02573 CV_Error( CV_StsUnsupportedFormat, "" ); 02574 02575 return std::sqrt(result); 02576 } 02577 02578 /****************************************************************************************\ 02579 * MulTransposed * 02580 \****************************************************************************************/ 02581 02582 namespace cv 02583 { 02584 02585 template<typename sT, typename dT> static void 02586 MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) 02587 { 02588 int i, j, k; 02589 const sT* src = srcmat.ptr<sT>(); 02590 dT* dst = dstmat.ptr<dT>(); 02591 const dT* delta = deltamat.ptr<dT>(); 02592 size_t srcstep = srcmat.step/sizeof(src[0]); 02593 size_t dststep = dstmat.step/sizeof(dst[0]); 02594 size_t deltastep = 
deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0; 02595 int delta_cols = deltamat.cols; 02596 Size size = srcmat.size(); 02597 dT* tdst = dst; 02598 dT* col_buf = 0; 02599 dT* delta_buf = 0; 02600 int buf_size = size.height*sizeof(dT); 02601 AutoBuffer<uchar> buf; 02602 02603 if( delta && delta_cols < size.width ) 02604 { 02605 assert( delta_cols == 1 ); 02606 buf_size *= 5; 02607 } 02608 buf.allocate(buf_size); 02609 col_buf = (dT*)(uchar*)buf; 02610 02611 if( delta && delta_cols < size.width ) 02612 { 02613 delta_buf = col_buf + size.height; 02614 for( i = 0; i < size.height; i++ ) 02615 delta_buf[i*4] = delta_buf[i*4+1] = 02616 delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep]; 02617 delta = delta_buf; 02618 deltastep = deltastep ? 4 : 0; 02619 } 02620 02621 if( !delta ) 02622 for( i = 0; i < size.width; i++, tdst += dststep ) 02623 { 02624 for( k = 0; k < size.height; k++ ) 02625 col_buf[k] = src[k*srcstep+i]; 02626 02627 for( j = i; j <= size.width - 4; j += 4 ) 02628 { 02629 double s0 = 0, s1 = 0, s2 = 0, s3 = 0; 02630 const sT *tsrc = src + j; 02631 02632 for( k = 0; k < size.height; k++, tsrc += srcstep ) 02633 { 02634 double a = col_buf[k]; 02635 s0 += a * tsrc[0]; 02636 s1 += a * tsrc[1]; 02637 s2 += a * tsrc[2]; 02638 s3 += a * tsrc[3]; 02639 } 02640 02641 tdst[j] = (dT)(s0*scale); 02642 tdst[j+1] = (dT)(s1*scale); 02643 tdst[j+2] = (dT)(s2*scale); 02644 tdst[j+3] = (dT)(s3*scale); 02645 } 02646 02647 for( ; j < size.width; j++ ) 02648 { 02649 double s0 = 0; 02650 const sT *tsrc = src + j; 02651 02652 for( k = 0; k < size.height; k++, tsrc += srcstep ) 02653 s0 += (double)col_buf[k] * tsrc[0]; 02654 02655 tdst[j] = (dT)(s0*scale); 02656 } 02657 } 02658 else 02659 for( i = 0; i < size.width; i++, tdst += dststep ) 02660 { 02661 if( !delta_buf ) 02662 for( k = 0; k < size.height; k++ ) 02663 col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i]; 02664 else 02665 for( k = 0; k < size.height; k++ ) 02666 col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep]; 02667 02668 for( j = i; j <= size.width - 4; j += 4 ) 02669 { 02670 double s0 = 0, s1 = 0, s2 = 0, s3 = 0; 02671 const sT *tsrc = src + j; 02672 const dT *d = delta_buf ? delta_buf : delta + j; 02673 02674 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) 02675 { 02676 double a = col_buf[k]; 02677 s0 += a * (tsrc[0] - d[0]); 02678 s1 += a * (tsrc[1] - d[1]); 02679 s2 += a * (tsrc[2] - d[2]); 02680 s3 += a * (tsrc[3] - d[3]); 02681 } 02682 02683 tdst[j] = (dT)(s0*scale); 02684 tdst[j+1] = (dT)(s1*scale); 02685 tdst[j+2] = (dT)(s2*scale); 02686 tdst[j+3] = (dT)(s3*scale); 02687 } 02688 02689 for( ; j < size.width; j++ ) 02690 { 02691 double s0 = 0; 02692 const sT *tsrc = src + j; 02693 const dT *d = delta_buf ? delta_buf : delta + j; 02694 02695 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) 02696 s0 += (double)col_buf[k] * (tsrc[0] - d[0]); 02697 02698 tdst[j] = (dT)(s0*scale); 02699 } 02700 } 02701 } 02702 02703 02704 template<typename sT, typename dT> static void 02705 MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale ) 02706 { 02707 int i, j, k; 02708 const sT* src = srcmat.ptr<sT>(); 02709 dT* dst = dstmat.ptr<dT>(); 02710 const dT* delta = deltamat.ptr<dT>(); 02711 size_t srcstep = srcmat.step/sizeof(src[0]); 02712 size_t dststep = dstmat.step/sizeof(dst[0]); 02713 size_t deltastep = deltamat.rows > 1 ? 
deltamat.step/sizeof(delta[0]) : 0; 02714 int delta_cols = deltamat.cols; 02715 Size size = srcmat.size(); 02716 dT* tdst = dst; 02717 02718 if( !delta ) 02719 for( i = 0; i < size.height; i++, tdst += dststep ) 02720 for( j = i; j < size.height; j++ ) 02721 { 02722 double s = 0; 02723 const sT *tsrc1 = src + i*srcstep; 02724 const sT *tsrc2 = src + j*srcstep; 02725 02726 for( k = 0; k <= size.width - 4; k += 4 ) 02727 s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] + 02728 (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3]; 02729 for( ; k < size.width; k++ ) 02730 s += (double)tsrc1[k] * tsrc2[k]; 02731 tdst[j] = (dT)(s*scale); 02732 } 02733 else 02734 { 02735 dT delta_buf[4]; 02736 int delta_shift = delta_cols == size.width ? 4 : 0; 02737 AutoBuffer<uchar> buf(size.width*sizeof(dT)); 02738 dT* row_buf = (dT*)(uchar*)buf; 02739 02740 for( i = 0; i < size.height; i++, tdst += dststep ) 02741 { 02742 const sT *tsrc1 = src + i*srcstep; 02743 const dT *tdelta1 = delta + i*deltastep; 02744 02745 if( delta_cols < size.width ) 02746 for( k = 0; k < size.width; k++ ) 02747 row_buf[k] = tsrc1[k] - tdelta1[0]; 02748 else 02749 for( k = 0; k < size.width; k++ ) 02750 row_buf[k] = tsrc1[k] - tdelta1[k]; 02751 02752 for( j = i; j < size.height; j++ ) 02753 { 02754 double s = 0; 02755 const sT *tsrc2 = src + j*srcstep; 02756 const dT *tdelta2 = delta + j*deltastep; 02757 if( delta_cols < size.width ) 02758 { 02759 delta_buf[0] = delta_buf[1] = 02760 delta_buf[2] = delta_buf[3] = tdelta2[0]; 02761 tdelta2 = delta_buf; 02762 } 02763 for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift ) 02764 s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) + 02765 (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) + 02766 (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) + 02767 (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]); 02768 for( ; k < size.width; k++, tdelta2++ ) 02769 s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]); 02770 tdst[j] = (dT)(s*scale); 02771 } 02772 } 02773 } 02774 } 02775 02776 typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale); 02777 02778 } 02779 02780 void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata, 02781 InputArray _delta, double scale, int dtype ) 02782 { 02783 Mat src = _src.getMat(), delta = _delta.getMat(); 02784 const int gemm_level = 100; // boundary above which GEMM is faster. 02785 int stype = src.type(); 02786 dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F); 02787 CV_Assert( src.channels() == 1 ); 02788 02789 if( !delta.empty() ) 02790 { 02791 CV_Assert( delta.channels() == 1 && 02792 (delta.rows == src.rows || delta.rows == 1) && 02793 (delta.cols == src.cols || delta.cols == 1)); 02794 if( delta.type() != dtype ) 02795 delta.convertTo(delta, dtype); 02796 } 02797 02798 int dsize = ata ? src.cols : src.rows; 02799 _dst.create( dsize, dsize, dtype ); 02800 Mat dst = _dst.getMat(); 02801 02802 if( src.data == dst.data || (stype == dtype && 02803 (dst.cols >= gemm_level && dst.rows >= gemm_level && 02804 src.cols >= gemm_level && src.rows >= gemm_level))) 02805 { 02806 Mat src2; 02807 const Mat* tsrc = &src; 02808 if( !delta.empty() ) 02809 { 02810 if( delta.size() == src.size() ) 02811 subtract( src, delta, src2 ); 02812 else 02813 { 02814 repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2); 02815 subtract( src, src2, src2 ); 02816 } 02817 tsrc = &src2; 02818 } 02819 gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? 
GEMM_1_T : GEMM_2_T ); 02820 } 02821 else 02822 { 02823 MulTransposedFunc func = 0; 02824 if(stype == CV_8U && dtype == CV_32F) 02825 { 02826 if(ata) 02827 func = MulTransposedR<uchar,float>; 02828 else 02829 func = MulTransposedL<uchar,float>; 02830 } 02831 else if(stype == CV_8U && dtype == CV_64F) 02832 { 02833 if(ata) 02834 func = MulTransposedR<uchar,double>; 02835 else 02836 func = MulTransposedL<uchar,double>; 02837 } 02838 else if(stype == CV_16U && dtype == CV_32F) 02839 { 02840 if(ata) 02841 func = MulTransposedR<ushort,float>; 02842 else 02843 func = MulTransposedL<ushort,float>; 02844 } 02845 else if(stype == CV_16U && dtype == CV_64F) 02846 { 02847 if(ata) 02848 func = MulTransposedR<ushort,double>; 02849 else 02850 func = MulTransposedL<ushort,double>; 02851 } 02852 else if(stype == CV_16S && dtype == CV_32F) 02853 { 02854 if(ata) 02855 func = MulTransposedR<short,float>; 02856 else 02857 func = MulTransposedL<short,float>; 02858 } 02859 else if(stype == CV_16S && dtype == CV_64F) 02860 { 02861 if(ata) 02862 func = MulTransposedR<short,double>; 02863 else 02864 func = MulTransposedL<short,double>; 02865 } 02866 else if(stype == CV_32F && dtype == CV_32F) 02867 { 02868 if(ata) 02869 func = MulTransposedR<float,float>; 02870 else 02871 func = MulTransposedL<float,float>; 02872 } 02873 else if(stype == CV_32F && dtype == CV_64F) 02874 { 02875 if(ata) 02876 func = MulTransposedR<float,double>; 02877 else 02878 func = MulTransposedL<float,double>; 02879 } 02880 else if(stype == CV_64F && dtype == CV_64F) 02881 { 02882 if(ata) 02883 func = MulTransposedR<double,double>; 02884 else 02885 func = MulTransposedL<double,double>; 02886 } 02887 if( !func ) 02888 CV_Error( CV_StsUnsupportedFormat, "" ); 02889 02890 func( src, dst, delta, scale ); 02891 completeSymm( dst, false ); 02892 } 02893 } 02894 02895 /****************************************************************************************\ 02896 * Dot Product * 02897 \****************************************************************************************/ 02898 02899 namespace cv 02900 { 02901 02902 template<typename T> double 02903 dotProd_(const T* src1, const T* src2, int len) 02904 { 02905 int i = 0; 02906 double result = 0; 02907 02908 #if CV_ENABLE_UNROLLED 02909 for( ; i <= len - 4; i += 4 ) 02910 result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] + 02911 (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; 02912 #endif 02913 for( ; i < len; i++ ) 02914 result += (double)src1[i]*src2[i]; 02915 02916 return result; 02917 } 02918 02919 02920 static double dotProd_8u(const uchar* src1, const uchar* src2, int len) 02921 { 02922 double r = 0; 02923 #if ARITHM_USE_IPP && IPP_DISABLE_BLOCK 02924 CV_IPP_CHECK() 02925 { 02926 if (0 <= ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])), 02927 src2, (int)(len*sizeof(src2[0])), 02928 ippiSize(len, 1), &r)) 02929 { 02930 CV_IMPL_ADD(CV_IMPL_IPP); 02931 return r; 02932 } 02933 setIppErrorStatus(); 02934 } 02935 #endif 02936 int i = 0; 02937 02938 #if CV_SSE2 02939 if( USE_SSE2 ) 02940 { 02941 int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; 02942 __m128i z = _mm_setzero_si128(); 02943 CV_DECL_ALIGNED(16) int buf[4]; 02944 02945 while( i < len0 ) 02946 { 02947 blockSize = std::min(len0 - i, blockSize0); 02948 __m128i s = z; 02949 j = 0; 02950 for( ; j <= blockSize - 16; j += 16 ) 02951 { 02952 __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j)); 02953 __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j)); 02954 __m128i s0, s1, s2, s3; 02955 s0 = 
_mm_unpacklo_epi8(b0, z); 02956 s2 = _mm_unpackhi_epi8(b0, z); 02957 s1 = _mm_unpacklo_epi8(b1, z); 02958 s3 = _mm_unpackhi_epi8(b1, z); 02959 s0 = _mm_madd_epi16(s0, s1); 02960 s2 = _mm_madd_epi16(s2, s3); 02961 s = _mm_add_epi32(s, s0); 02962 s = _mm_add_epi32(s, s2); 02963 } 02964 02965 for( ; j < blockSize; j += 4 ) 02966 { 02967 __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z); 02968 __m128i s1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src2 + j)), z); 02969 s0 = _mm_madd_epi16(s0, s1); 02970 s = _mm_add_epi32(s, s0); 02971 } 02972 02973 _mm_store_si128((__m128i*)buf, s); 02974 r += buf[0] + buf[1] + buf[2] + buf[3]; 02975 02976 src1 += blockSize; 02977 src2 += blockSize; 02978 i += blockSize; 02979 } 02980 } 02981 #elif CV_NEON 02982 int len0 = len & -8, blockSize0 = (1 << 15), blockSize; 02983 uint32x4_t v_zero = vdupq_n_u32(0u); 02984 CV_DECL_ALIGNED(16) uint buf[4]; 02985 02986 while( i < len0 ) 02987 { 02988 blockSize = std::min(len0 - i, blockSize0); 02989 uint32x4_t v_sum = v_zero; 02990 02991 int j = 0; 02992 for( ; j <= blockSize - 16; j += 16 ) 02993 { 02994 uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j); 02995 02996 uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2)); 02997 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); 02998 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); 02999 03000 v_src10 = vmovl_u8(vget_high_u8(v_src1)); 03001 v_src20 = vmovl_u8(vget_high_u8(v_src2)); 03002 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); 03003 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); 03004 } 03005 03006 for( ; j <= blockSize - 8; j += 8 ) 03007 { 03008 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j)); 03009 v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2)); 03010 v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2)); 03011 } 03012 03013 vst1q_u32(buf, v_sum); 03014 r += buf[0] + buf[1] + buf[2] + buf[3]; 03015 03016 src1 += blockSize; 03017 src2 += blockSize; 03018 i += blockSize; 03019 } 03020 #endif 03021 return r + dotProd_(src1, src2, len - i); 03022 } 03023 03024 03025 static double dotProd_8s(const schar* src1, const schar* src2, int len) 03026 { 03027 int i = 0; 03028 double r = 0.0; 03029 03030 #if CV_SSE2 03031 if( USE_SSE2 ) 03032 { 03033 int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; 03034 __m128i z = _mm_setzero_si128(); 03035 CV_DECL_ALIGNED(16) int buf[4]; 03036 03037 while( i < len0 ) 03038 { 03039 blockSize = std::min(len0 - i, blockSize0); 03040 __m128i s = z; 03041 j = 0; 03042 for( ; j <= blockSize - 16; j += 16 ) 03043 { 03044 __m128i b0 = _mm_loadu_si128((const __m128i*)(src1 + j)); 03045 __m128i b1 = _mm_loadu_si128((const __m128i*)(src2 + j)); 03046 __m128i s0, s1, s2, s3; 03047 s0 = _mm_srai_epi16(_mm_unpacklo_epi8(b0, b0), 8); 03048 s2 = _mm_srai_epi16(_mm_unpackhi_epi8(b0, b0), 8); 03049 s1 = _mm_srai_epi16(_mm_unpacklo_epi8(b1, b1), 8); 03050 s3 = _mm_srai_epi16(_mm_unpackhi_epi8(b1, b1), 8); 03051 s0 = _mm_madd_epi16(s0, s1); 03052 s2 = _mm_madd_epi16(s2, s3); 03053 s = _mm_add_epi32(s, s0); 03054 s = _mm_add_epi32(s, s2); 03055 } 03056 03057 for( ; j < blockSize; j += 4 ) 03058 { 03059 __m128i s0 = _mm_cvtsi32_si128(*(const int*)(src1 + j)); 03060 __m128i s1 = _mm_cvtsi32_si128(*(const int*)(src2 + j)); 03061 s0 = _mm_srai_epi16(_mm_unpacklo_epi8(s0, s0), 8); 03062 s1 = 
_mm_srai_epi16(_mm_unpacklo_epi8(s1, s1), 8); 03063 s0 = _mm_madd_epi16(s0, s1); 03064 s = _mm_add_epi32(s, s0); 03065 } 03066 03067 _mm_store_si128((__m128i*)buf, s); 03068 r += buf[0] + buf[1] + buf[2] + buf[3]; 03069 03070 src1 += blockSize; 03071 src2 += blockSize; 03072 i += blockSize; 03073 } 03074 } 03075 #elif CV_NEON 03076 int len0 = len & -8, blockSize0 = (1 << 14), blockSize; 03077 int32x4_t v_zero = vdupq_n_s32(0); 03078 CV_DECL_ALIGNED(16) int buf[4]; 03079 03080 while( i < len0 ) 03081 { 03082 blockSize = std::min(len0 - i, blockSize0); 03083 int32x4_t v_sum = v_zero; 03084 03085 int j = 0; 03086 for( ; j <= blockSize - 16; j += 16 ) 03087 { 03088 int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j); 03089 03090 int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2)); 03091 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); 03092 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); 03093 03094 v_src10 = vmovl_s8(vget_high_s8(v_src1)); 03095 v_src20 = vmovl_s8(vget_high_s8(v_src2)); 03096 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); 03097 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); 03098 } 03099 03100 for( ; j <= blockSize - 8; j += 8 ) 03101 { 03102 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j)); 03103 v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2)); 03104 v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2)); 03105 } 03106 03107 vst1q_s32(buf, v_sum); 03108 r += buf[0] + buf[1] + buf[2] + buf[3]; 03109 03110 src1 += blockSize; 03111 src2 += blockSize; 03112 i += blockSize; 03113 } 03114 #endif 03115 03116 return r + dotProd_(src1, src2, len - i); 03117 } 03118 03119 static double dotProd_16u(const ushort* src1, const ushort* src2, int len) 03120 { 03121 #if (ARITHM_USE_IPP == 1) 03122 CV_IPP_CHECK() 03123 { 03124 double r = 0; 03125 if (0 <= ippiDotProd_16u64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 03126 { 03127 CV_IMPL_ADD(CV_IMPL_IPP); 03128 return r; 03129 } 03130 setIppErrorStatus(); 03131 } 03132 #endif 03133 return dotProd_(src1, src2, len); 03134 } 03135 03136 static double dotProd_16s(const short* src1, const short* src2, int len) 03137 { 03138 #if (ARITHM_USE_IPP == 1) && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0 03139 CV_IPP_CHECK() 03140 { 03141 double r = 0; 03142 if (0 <= ippiDotProd_16s64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 03143 { 03144 CV_IMPL_ADD(CV_IMPL_IPP); 03145 return r; 03146 } 03147 setIppErrorStatus(); 03148 } 03149 #endif 03150 return dotProd_(src1, src2, len); 03151 } 03152 03153 static double dotProd_32s(const int* src1, const int* src2, int len) 03154 { 03155 #if (ARITHM_USE_IPP == 1) 03156 CV_IPP_CHECK() 03157 { 03158 double r = 0; 03159 if (0 <= ippiDotProd_32s64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r)) 03160 { 03161 CV_IMPL_ADD(CV_IMPL_IPP); 03162 return r; 03163 } 03164 setIppErrorStatus(); 03165 } 03166 #endif 03167 return dotProd_(src1, src2, len); 03168 } 03169 03170 static double dotProd_32f(const float* src1, const float* src2, int len) 03171 { 03172 double r = 0.0; 03173 int i = 0; 03174 03175 #if (ARITHM_USE_IPP == 1) 03176 CV_IPP_CHECK() 03177 { 03178 if (0 <= ippsDotProd_32f64f(src1, src2, len, &r)) 03179 { 03180 CV_IMPL_ADD(CV_IMPL_IPP); 
03181 return r; 03182 } 03183 setIppErrorStatus(); 03184 } 03185 #elif CV_NEON 03186 int len0 = len & -4, blockSize0 = (1 << 13), blockSize; 03187 float32x4_t v_zero = vdupq_n_f32(0.0f); 03188 CV_DECL_ALIGNED(16) float buf[4]; 03189 03190 while( i < len0 ) 03191 { 03192 blockSize = std::min(len0 - i, blockSize0); 03193 float32x4_t v_sum = v_zero; 03194 03195 int j = 0; 03196 for( ; j <= blockSize - 4; j += 4 ) 03197 v_sum = vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j)); 03198 03199 vst1q_f32(buf, v_sum); 03200 r += buf[0] + buf[1] + buf[2] + buf[3]; 03201 03202 src1 += blockSize; 03203 src2 += blockSize; 03204 i += blockSize; 03205 } 03206 #endif 03207 return r + dotProd_(src1, src2, len - i); 03208 } 03209 03210 static double dotProd_64f(const double* src1, const double* src2, int len) 03211 { 03212 #if (ARITHM_USE_IPP == 1) 03213 CV_IPP_CHECK() 03214 { 03215 double r = 0; 03216 if (0 <= ippsDotProd_64f(src1, src2, len, &r)) 03217 { 03218 CV_IMPL_ADD(CV_IMPL_IPP); 03219 return r; 03220 } 03221 setIppErrorStatus(); 03222 } 03223 #endif 03224 return dotProd_(src1, src2, len); 03225 } 03226 03227 03228 typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len); 03229 03230 static DotProdFunc getDotProdFunc(int depth) 03231 { 03232 static DotProdFunc dotProdTab[] = 03233 { 03234 (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s), 03235 (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s, 03236 (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f), 03237 (DotProdFunc)dotProd_64f, 0 03238 }; 03239 03240 return dotProdTab[depth]; 03241 } 03242 03243 double Mat::dot(InputArray _mat) const 03244 { 03245 Mat mat = _mat.getMat(); 03246 int cn = channels(); 03247 DotProdFunc func = getDotProdFunc(depth()); 03248 CV_Assert( mat.type() == type() && mat.size == size && func != 0 ); 03249 03250 if( isContinuous() && mat.isContinuous() ) 03251 { 03252 size_t len = total()*cn; 03253 if( len == (size_t)(int)len ) 03254 return func(data, mat.data, (int)len); 03255 } 03256 03257 const Mat* arrays[] = {this, &mat, 0}; 03258 uchar* ptrs[2]; 03259 NAryMatIterator it(arrays, ptrs); 03260 int len = (int)(it.size*cn); 03261 double r = 0; 03262 03263 for( size_t i = 0; i < it.nplanes; i++, ++it ) 03264 r += func( ptrs[0], ptrs[1], len ); 03265 03266 return r; 03267 } 03268 03269 } 03270 03271 /****************************************************************************************\ 03272 * Earlier API * 03273 \****************************************************************************************/ 03274 03275 CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha, 03276 const CvArr* Carr, double beta, CvArr* Darr, int flags ) 03277 { 03278 cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr); 03279 cv::Mat C, D = cv::cvarrToMat(Darr); 03280 03281 if( Carr ) 03282 C = cv::cvarrToMat(Carr); 03283 03284 CV_Assert( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)) && 03285 (D.cols == ((flags & CV_GEMM_B_T) == 0 ? 
B.cols : B.rows)) && 03286 D.type() == A.type() ); 03287 03288 gemm( A, B, alpha, C, beta, D, flags ); 03289 } 03290 03291 03292 CV_IMPL void 03293 cvTransform( const CvArr* srcarr, CvArr* dstarr, 03294 const CvMat* transmat, const CvMat* shiftvec ) 03295 { 03296 cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 03297 03298 if( shiftvec ) 03299 { 03300 cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows), 03301 _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols); 03302 m.convertTo(m1, m1.type()); 03303 v.convertTo(v1, v1.type()); 03304 m = _m; 03305 } 03306 03307 CV_Assert( dst.depth() == src.depth() && dst.channels() == m.rows ); 03308 cv::transform( src, dst, m ); 03309 } 03310 03311 03312 CV_IMPL void 03313 cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat ) 03314 { 03315 cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 03316 03317 CV_Assert( dst.type() == src.type() && dst.channels() == m.rows-1 ); 03318 cv::perspectiveTransform( src, dst, m ); 03319 } 03320 03321 03322 CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale, 03323 const CvArr* srcarr2, CvArr* dstarr ) 03324 { 03325 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr); 03326 03327 CV_Assert( src1.size == dst.size && src1.type() == dst.type() ); 03328 cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst ); 03329 } 03330 03331 03332 CV_IMPL void 03333 cvCalcCovarMatrix( const CvArr** vecarr, int count, 03334 CvArr* covarr, CvArr* avgarr, int flags ) 03335 { 03336 cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean; 03337 CV_Assert( vecarr != 0 && count >= 1 ); 03338 03339 if( avgarr ) 03340 mean = mean0 = cv::cvarrToMat(avgarr); 03341 03342 if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 ) 03343 { 03344 03345 cv::Mat data = cv::cvarrToMat(vecarr[0]); 03346 cv::calcCovarMatrix( data, cov, mean, flags, cov.type() ); 03347 } 03348 else 03349 { 03350 std::vector<cv::Mat> data(count); 03351 for( int i = 0; i < count; i++ ) 03352 data[i] = cv::cvarrToMat(vecarr[i]); 03353 cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() ); 03354 } 03355 03356 if( mean.data != mean0.data && mean0.data ) 03357 mean.convertTo(mean0, mean0.type()); 03358 03359 if( cov.data != cov0.data ) 03360 cov.convertTo(cov0, cov0.type()); 03361 } 03362 03363 03364 CV_IMPL double 03365 cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr ) 03366 { 03367 return cv::Mahalanobis(cv::cvarrToMat(srcAarr), 03368 cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr)); 03369 } 03370 03371 CV_IMPL void 03372 cvMulTransposed( const CvArr* srcarr, CvArr* dstarr, 03373 int order, const CvArr* deltaarr, double scale ) 03374 { 03375 cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta; 03376 if( deltaarr ) 03377 delta = cv::cvarrToMat(deltaarr); 03378 cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type()); 03379 if( dst.data != dst0.data ) 03380 dst.convertTo(dst0, dst0.type()); 03381 } 03382 03383 CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr ) 03384 { 03385 return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr)); 03386 } 03387 03388 03389 CV_IMPL void 03390 cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags ) 03391 { 03392 cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr); 03393 
cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects); 03394 cv::Mat mean = mean0, evals = evals0, evects = evects0; 03395 03396 cv::PCA pca; 03397 pca.mean = mean; 03398 pca.eigenvalues = evals; 03399 pca.eigenvectors = evects; 03400 03401 pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(), 03402 flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0); 03403 03404 if( pca.mean.size() == mean.size() ) 03405 pca.mean.convertTo( mean, mean.type() ); 03406 else 03407 { 03408 cv::Mat temp; pca.mean.convertTo( temp, mean.type() ); 03409 transpose( temp, mean ); 03410 } 03411 03412 evals = pca.eigenvalues; 03413 evects = pca.eigenvectors; 03414 int ecount0 = evals0.cols + evals0.rows - 1; 03415 int ecount = evals.cols + evals.rows - 1; 03416 03417 CV_Assert( (evals0.cols == 1 || evals0.rows == 1) && 03418 ecount0 <= ecount && 03419 evects0.cols == evects.cols && 03420 evects0.rows == ecount0 ); 03421 03422 cv::Mat temp = evals0; 03423 if( evals.rows == 1 ) 03424 evals.colRange(0, ecount0).convertTo(temp, evals0.type()); 03425 else 03426 evals.rowRange(0, ecount0).convertTo(temp, evals0.type()); 03427 if( temp.data != evals0.data ) 03428 transpose(temp, evals0); 03429 evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() ); 03430 03431 // otherwise some datatype's or size's were incorrect, so the output arrays have been reallocated 03432 CV_Assert( mean0.data == mean.data ); 03433 } 03434 03435 03436 CV_IMPL void 03437 cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr, 03438 const CvArr* eigenvects, CvArr* result_arr ) 03439 { 03440 cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr); 03441 cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0; 03442 03443 cv::PCA pca; 03444 pca.mean = mean; 03445 int n; 03446 if( mean.rows == 1 ) 03447 { 03448 CV_Assert(dst.cols <= evects.rows && dst.rows == data.rows); 03449 n = dst.cols; 03450 } 03451 else 03452 { 03453 CV_Assert(dst.rows <= evects.rows && dst.cols == data.cols); 03454 n = dst.rows; 03455 } 03456 pca.eigenvectors = evects.rowRange(0, n); 03457 03458 cv::Mat result = pca.project(data); 03459 if( result.cols != dst.cols ) 03460 result = result.reshape(1, 1); 03461 result.convertTo(dst, dst.type()); 03462 03463 CV_Assert(dst0.data == dst.data); 03464 } 03465 03466 03467 CV_IMPL void 03468 cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr, 03469 const CvArr* eigenvects, CvArr* result_arr ) 03470 { 03471 cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr); 03472 cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0; 03473 03474 cv::PCA pca; 03475 pca.mean = mean; 03476 int n; 03477 if( mean.rows == 1 ) 03478 { 03479 CV_Assert(data.cols <= evects.rows && dst.rows == data.rows); 03480 n = data.cols; 03481 } 03482 else 03483 { 03484 CV_Assert(data.rows <= evects.rows && dst.cols == data.cols); 03485 n = data.rows; 03486 } 03487 pca.eigenvectors = evects.rowRange(0, n); 03488 03489 cv::Mat result = pca.backProject(data); 03490 result.convertTo(dst, dst.type()); 03491 03492 CV_Assert(dst0.data == dst.data); 03493 } 03494 03495 /* End of file. */ 03496
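For readers who use this file through the public OpenCV API rather than the SIMD internals above, a minimal usage sketch of three of the functions implemented in this listing (cv::transform, cv::scaleAdd and cv::perspectiveTransform) follows. Only the function signatures come from the code above; the include path assumes an OpenCV 3.x layout, and the matrix values, image size and main() harness are illustrative assumptions, not part of the original file.

#include <opencv2/core.hpp>
#include <vector>

int main()
{
    // cv::transform: every pixel's channels are multiplied by a dcn x scn
    // (or dcn x (scn+1)) matrix; this 3x4 matrix halves each channel of a
    // 3-channel 8-bit image and adds an offset of 10.
    float mdata[12] = { 0.5f, 0.0f, 0.0f, 10.0f,
                        0.0f, 0.5f, 0.0f, 10.0f,
                        0.0f, 0.0f, 0.5f, 10.0f };
    cv::Mat m(3, 4, CV_32F, mdata);
    cv::Mat src(4, 4, CV_8UC3, cv::Scalar(100, 150, 200)), dst;
    cv::transform(src, dst, m);

    // cv::scaleAdd: dst = src1*alpha + src2, element-wise.
    cv::Mat a = cv::Mat::ones(3, 3, CV_32F), b = cv::Mat::eye(3, 3, CV_32F), c;
    cv::scaleAdd(a, 2.0, b, c);

    // cv::perspectiveTransform: floating-point point data plus an
    // (N+1)x(N+1) homogeneous matrix; with the identity matrix the
    // points come back unchanged.
    std::vector<cv::Point2f> pts, out;
    pts.push_back(cv::Point2f(0.f, 0.f));
    pts.push_back(cv::Point2f(1.f, 2.f));
    cv::Mat h = cv::Mat::eye(3, 3, CV_64F);
    cv::perspectiveTransform(pts, out, h);
    return 0;
}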
