Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of gr-peach-opencv-project by
stat.cpp
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved. 00016 // Third party copyrights are property of their respective owners. 00017 // 00018 // Redistribution and use in source and binary forms, with or without modification, 00019 // are permitted provided that the following conditions are met: 00020 // 00021 // * Redistribution's of source code must retain the above copyright notice, 00022 // this list of conditions and the following disclaimer. 00023 // 00024 // * Redistribution's in binary form must reproduce the above copyright notice, 00025 // this list of conditions and the following disclaimer in the documentation 00026 // and/or other materials provided with the distribution. 00027 // 00028 // * The name of the copyright holders may not be used to endorse or promote products 00029 // derived from this software without specific prior written permission. 00030 // 00031 // This software is provided by the copyright holders and contributors "as is" and 00032 // any express or implied warranties, including, but not limited to, the implied 00033 // warranties of merchantability and fitness for a particular purpose are disclaimed. 00034 // In no event shall the Intel Corporation or contributors be liable for any direct, 00035 // indirect, incidental, special, exemplary, or consequential damages 00036 // (including, but not limited to, procurement of substitute goods or services; 00037 // loss of use, data, or profits; or business interruption) however caused 00038 // and on any theory of liability, whether in contract, strict liability, 00039 // or tort (including negligence or otherwise) arising in any way out of 00040 // the use of this software, even if advised of the possibility of such damage. 00041 // 00042 //M*/ 00043 00044 #include "precomp.hpp" 00045 #include <climits> 00046 #include <limits> 00047 00048 #include "opencl_kernels_core.hpp" 00049 00050 namespace cv 00051 { 00052 00053 template<typename T> static inline Scalar rawToScalar(const T& v) 00054 { 00055 Scalar s; 00056 typedef typename DataType<T>::channel_type T1; 00057 int i, n = DataType<T>::channels; 00058 for( i = 0; i < n; i++ ) 00059 s.val[i] = ((T1*)&v)[i]; 00060 return s; 00061 } 00062 00063 /****************************************************************************************\ 00064 * sum * 00065 \****************************************************************************************/ 00066 00067 template <typename T, typename ST> 00068 struct Sum_SIMD 00069 { 00070 int operator () (const T *, const uchar *, ST *, int, int) const 00071 { 00072 return 0; 00073 } 00074 }; 00075 00076 #if CV_SSE2 00077 00078 template <> 00079 struct Sum_SIMD<schar, int> 00080 { 00081 int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const 00082 { 00083 if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) 00084 return 0; 00085 00086 int x = 0; 00087 __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero; 00088 00089 for ( ; x <= len - 16; x += 16) 00090 { 00091 __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); 00092 __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); 00093 00094 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); 00095 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); 00096 00097 v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); 00098 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); 00099 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); 00100 } 00101 00102 for ( ; x <= len - 8; x += 8) 00103 { 00104 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); 00105 00106 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 00107 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 00108 } 00109 00110 int CV_DECL_ALIGNED(16) ar[4]; 00111 _mm_store_si128((__m128i*)ar, v_sum); 00112 00113 for (int i = 0; i < 4; i += cn) 00114 for (int j = 0; j < cn; ++j) 00115 dst[j] += ar[j + i]; 00116 00117 return x / cn; 00118 } 00119 }; 00120 00121 template <> 00122 struct Sum_SIMD<int, double> 00123 { 00124 int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const 00125 { 00126 if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) 00127 return 0; 00128 00129 int x = 0; 00130 __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; 00131 00132 for ( ; x <= len - 4; x += 4) 00133 { 00134 __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x)); 00135 v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src)); 00136 v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8))); 00137 } 00138 00139 double CV_DECL_ALIGNED(16) ar[4]; 00140 _mm_store_pd(ar, v_sum0); 00141 _mm_store_pd(ar + 2, v_sum1); 00142 00143 for (int i = 0; i < 4; i += cn) 00144 for (int j = 0; j < cn; ++j) 00145 dst[j] += ar[j + i]; 00146 00147 return x / cn; 00148 } 00149 }; 00150 00151 template <> 00152 struct Sum_SIMD<float, double> 00153 { 00154 int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const 00155 { 00156 if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) 00157 return 0; 00158 00159 int x = 0; 00160 __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; 00161 00162 for ( ; x <= len - 4; x += 4) 00163 { 00164 __m128 v_src = _mm_loadu_ps(src0 + x); 00165 v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src)); 00166 v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); 00167 v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src)); 00168 } 00169 00170 double CV_DECL_ALIGNED(16) ar[4]; 00171 _mm_store_pd(ar, v_sum0); 00172 _mm_store_pd(ar + 2, v_sum1); 00173 00174 for (int i = 0; i < 4; i += cn) 00175 for (int j = 0; j < cn; ++j) 00176 dst[j] += ar[j + i]; 00177 00178 return x / cn; 00179 } 00180 }; 00181 00182 00183 #elif CV_NEON 00184 00185 template <> 00186 struct Sum_SIMD<uchar, int> 00187 { 00188 int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const 00189 { 00190 if (mask || (cn != 1 && cn != 2 && cn != 4)) 00191 return 0; 00192 00193 int x = 0; 00194 uint32x4_t v_sum = vdupq_n_u32(0u); 00195 00196 for ( ; x <= len - 16; x += 16) 00197 { 00198 uint8x16_t v_src = vld1q_u8(src0 + x); 00199 uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src)); 00200 00201 v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); 00202 v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); 00203 00204 v_half = vmovl_u8(vget_high_u8(v_src)); 00205 v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); 00206 v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); 00207 } 00208 00209 for ( ; x <= len - 8; x += 8) 00210 { 00211 uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x)); 00212 00213 v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); 00214 v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); 00215 } 00216 00217 unsigned int CV_DECL_ALIGNED(16) ar[4]; 00218 vst1q_u32(ar, v_sum); 00219 00220 for (int i = 0; i < 4; i += cn) 00221 for (int j = 0; j < cn; ++j) 00222 dst[j] += ar[j + i]; 00223 00224 return x / cn; 00225 } 00226 }; 00227 00228 template <> 00229 struct Sum_SIMD<schar, int> 00230 { 00231 int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const 00232 { 00233 if (mask || (cn != 1 && cn != 2 && cn != 4)) 00234 return 0; 00235 00236 int x = 0; 00237 int32x4_t v_sum = vdupq_n_s32(0); 00238 00239 for ( ; x <= len - 16; x += 16) 00240 { 00241 int8x16_t v_src = vld1q_s8(src0 + x); 00242 int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); 00243 00244 v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); 00245 v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); 00246 00247 v_half = vmovl_s8(vget_high_s8(v_src)); 00248 v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); 00249 v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); 00250 } 00251 00252 for ( ; x <= len - 8; x += 8) 00253 { 00254 int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x)); 00255 00256 v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); 00257 v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); 00258 } 00259 00260 int CV_DECL_ALIGNED(16) ar[4]; 00261 vst1q_s32(ar, v_sum); 00262 00263 for (int i = 0; i < 4; i += cn) 00264 for (int j = 0; j < cn; ++j) 00265 dst[j] += ar[j + i]; 00266 00267 return x / cn; 00268 } 00269 }; 00270 00271 template <> 00272 struct Sum_SIMD<ushort, int> 00273 { 00274 int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const 00275 { 00276 if (mask || (cn != 1 && cn != 2 && cn != 4)) 00277 return 0; 00278 00279 int x = 0; 00280 uint32x4_t v_sum = vdupq_n_u32(0u); 00281 00282 for ( ; x <= len - 8; x += 8) 00283 { 00284 uint16x8_t v_src = vld1q_u16(src0 + x); 00285 00286 v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); 00287 v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); 00288 } 00289 00290 for ( ; x <= len - 4; x += 4) 00291 v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x)); 00292 00293 unsigned int CV_DECL_ALIGNED(16) ar[4]; 00294 vst1q_u32(ar, v_sum); 00295 00296 for (int i = 0; i < 4; i += cn) 00297 for (int j = 0; j < cn; ++j) 00298 dst[j] += ar[j + i]; 00299 00300 return x / cn; 00301 } 00302 }; 00303 00304 template <> 00305 struct Sum_SIMD<short, int> 00306 { 00307 int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const 00308 { 00309 if (mask || (cn != 1 && cn != 2 && cn != 4)) 00310 return 0; 00311 00312 int x = 0; 00313 int32x4_t v_sum = vdupq_n_s32(0u); 00314 00315 for ( ; x <= len - 8; x += 8) 00316 { 00317 int16x8_t v_src = vld1q_s16(src0 + x); 00318 00319 v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); 00320 v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); 00321 } 00322 00323 for ( ; x <= len - 4; x += 4) 00324 v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x)); 00325 00326 int CV_DECL_ALIGNED(16) ar[4]; 00327 vst1q_s32(ar, v_sum); 00328 00329 for (int i = 0; i < 4; i += cn) 00330 for (int j = 0; j < cn; ++j) 00331 dst[j] += ar[j + i]; 00332 00333 return x / cn; 00334 } 00335 }; 00336 00337 #endif 00338 00339 template<typename T, typename ST> 00340 static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn ) 00341 { 00342 const T* src = src0; 00343 if( !mask ) 00344 { 00345 Sum_SIMD<T, ST> vop; 00346 int i = vop(src0, mask, dst, len, cn), k = cn % 4; 00347 src += i * cn; 00348 00349 if( k == 1 ) 00350 { 00351 ST s0 = dst[0]; 00352 00353 #if CV_ENABLE_UNROLLED 00354 for(; i <= len - 4; i += 4, src += cn*4 ) 00355 s0 += src[0] + src[cn] + src[cn*2] + src[cn*3]; 00356 #endif 00357 for( ; i < len; i++, src += cn ) 00358 s0 += src[0]; 00359 dst[0] = s0; 00360 } 00361 else if( k == 2 ) 00362 { 00363 ST s0 = dst[0], s1 = dst[1]; 00364 for( ; i < len; i++, src += cn ) 00365 { 00366 s0 += src[0]; 00367 s1 += src[1]; 00368 } 00369 dst[0] = s0; 00370 dst[1] = s1; 00371 } 00372 else if( k == 3 ) 00373 { 00374 ST s0 = dst[0], s1 = dst[1], s2 = dst[2]; 00375 for( ; i < len; i++, src += cn ) 00376 { 00377 s0 += src[0]; 00378 s1 += src[1]; 00379 s2 += src[2]; 00380 } 00381 dst[0] = s0; 00382 dst[1] = s1; 00383 dst[2] = s2; 00384 } 00385 00386 for( ; k < cn; k += 4 ) 00387 { 00388 src = src0 + i*cn + k; 00389 ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3]; 00390 for( ; i < len; i++, src += cn ) 00391 { 00392 s0 += src[0]; s1 += src[1]; 00393 s2 += src[2]; s3 += src[3]; 00394 } 00395 dst[k] = s0; 00396 dst[k+1] = s1; 00397 dst[k+2] = s2; 00398 dst[k+3] = s3; 00399 } 00400 return len; 00401 } 00402 00403 int i, nzm = 0; 00404 if( cn == 1 ) 00405 { 00406 ST s = dst[0]; 00407 for( i = 0; i < len; i++ ) 00408 if( mask[i] ) 00409 { 00410 s += src[i]; 00411 nzm++; 00412 } 00413 dst[0] = s; 00414 } 00415 else if( cn == 3 ) 00416 { 00417 ST s0 = dst[0], s1 = dst[1], s2 = dst[2]; 00418 for( i = 0; i < len; i++, src += 3 ) 00419 if( mask[i] ) 00420 { 00421 s0 += src[0]; 00422 s1 += src[1]; 00423 s2 += src[2]; 00424 nzm++; 00425 } 00426 dst[0] = s0; 00427 dst[1] = s1; 00428 dst[2] = s2; 00429 } 00430 else 00431 { 00432 for( i = 0; i < len; i++, src += cn ) 00433 if( mask[i] ) 00434 { 00435 int k = 0; 00436 #if CV_ENABLE_UNROLLED 00437 for( ; k <= cn - 4; k += 4 ) 00438 { 00439 ST s0, s1; 00440 s0 = dst[k] + src[k]; 00441 s1 = dst[k+1] + src[k+1]; 00442 dst[k] = s0; dst[k+1] = s1; 00443 s0 = dst[k+2] + src[k+2]; 00444 s1 = dst[k+3] + src[k+3]; 00445 dst[k+2] = s0; dst[k+3] = s1; 00446 } 00447 #endif 00448 for( ; k < cn; k++ ) 00449 dst[k] += src[k]; 00450 nzm++; 00451 } 00452 } 00453 return nzm; 00454 } 00455 00456 00457 static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn ) 00458 { return sum_(src, mask, dst, len, cn); } 00459 00460 static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn ) 00461 { return sum_(src, mask, dst, len, cn); } 00462 00463 static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn ) 00464 { return sum_(src, mask, dst, len, cn); } 00465 00466 static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn ) 00467 { return sum_(src, mask, dst, len, cn); } 00468 00469 static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn ) 00470 { return sum_(src, mask, dst, len, cn); } 00471 00472 static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn ) 00473 { return sum_(src, mask, dst, len, cn); } 00474 00475 static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn ) 00476 { return sum_(src, mask, dst, len, cn); } 00477 00478 typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int); 00479 00480 static SumFunc getSumFunc(int depth) 00481 { 00482 static SumFunc sumTab[] = 00483 { 00484 (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s, 00485 (SumFunc)sum16u, (SumFunc)sum16s, 00486 (SumFunc)sum32s, 00487 (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f, 00488 0 00489 }; 00490 00491 return sumTab[depth]; 00492 } 00493 00494 template<typename T> 00495 static int countNonZero_(const T* src, int len ) 00496 { 00497 int i=0, nz = 0; 00498 #if CV_ENABLE_UNROLLED 00499 for(; i <= len - 4; i += 4 ) 00500 nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0); 00501 #endif 00502 for( ; i < len; i++ ) 00503 nz += src[i] != 0; 00504 return nz; 00505 } 00506 00507 static int countNonZero8u( const uchar* src, int len ) 00508 { 00509 int i=0, nz = 0; 00510 #if CV_SSE2 00511 if(USE_SSE2)//5x-6x 00512 { 00513 __m128i v_zero = _mm_setzero_si128(); 00514 __m128i sum = _mm_setzero_si128(); 00515 00516 for (; i<=len-16; i+=16) 00517 { 00518 __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i)); 00519 sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero)); 00520 } 00521 nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))); 00522 } 00523 #elif CV_NEON 00524 int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6; 00525 uint32x4_t v_nz = vdupq_n_u32(0u); 00526 uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1); 00527 const uchar * src0 = src; 00528 00529 while( i < len0 ) 00530 { 00531 int blockSizei = std::min(len0 - i, blockSize0), j = 0; 00532 00533 while (j < blockSizei) 00534 { 00535 int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; 00536 uint8x16_t v_pz = v_zero; 00537 00538 for( ; k <= blockSizej - 16; k += 16 ) 00539 v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1)); 00540 00541 uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz)); 00542 v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz); 00543 v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz); 00544 00545 src0 += blockSizej; 00546 j += blockSizej; 00547 } 00548 00549 i += blockSizei; 00550 } 00551 00552 CV_DECL_ALIGNED(16) unsigned int buf[4]; 00553 vst1q_u32(buf, v_nz); 00554 nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]); 00555 #endif 00556 for( ; i < len; i++ ) 00557 nz += src[i] != 0; 00558 return nz; 00559 } 00560 00561 static int countNonZero16u( const ushort* src, int len ) 00562 { 00563 int i = 0, nz = 0; 00564 #if CV_SSE2 00565 if (USE_SSE2) 00566 { 00567 __m128i v_zero = _mm_setzero_si128 (); 00568 __m128i sum = _mm_setzero_si128(); 00569 00570 for ( ; i <= len - 8; i += 8) 00571 { 00572 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i)); 00573 sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero)); 00574 } 00575 00576 nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1); 00577 src += i; 00578 } 00579 #elif CV_NEON 00580 int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; 00581 uint32x4_t v_nz = vdupq_n_u32(0u); 00582 uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); 00583 00584 while( i < len0 ) 00585 { 00586 int blockSizei = std::min(len0 - i, blockSize0), j = 0; 00587 00588 while (j < blockSizei) 00589 { 00590 int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; 00591 uint16x8_t v_pz = v_zero; 00592 00593 for( ; k <= blockSizej - 8; k += 8 ) 00594 v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1)); 00595 00596 v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); 00597 00598 src += blockSizej; 00599 j += blockSizej; 00600 } 00601 00602 i += blockSizei; 00603 } 00604 00605 CV_DECL_ALIGNED(16) unsigned int buf[4]; 00606 vst1q_u32(buf, v_nz); 00607 nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]); 00608 #endif 00609 return nz + countNonZero_(src, len - i); 00610 } 00611 00612 static int countNonZero32s( const int* src, int len ) 00613 { 00614 int i = 0, nz = 0; 00615 #if CV_SSE2 00616 if (USE_SSE2) 00617 { 00618 __m128i v_zero = _mm_setzero_si128 (); 00619 __m128i sum = _mm_setzero_si128(); 00620 00621 for ( ; i <= len - 4; i += 4) 00622 { 00623 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i)); 00624 sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero)); 00625 } 00626 00627 nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2); 00628 src += i; 00629 } 00630 #elif CV_NEON 00631 int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; 00632 uint32x4_t v_nz = vdupq_n_u32(0u); 00633 int32x4_t v_zero = vdupq_n_s32(0.0f); 00634 uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); 00635 00636 while( i < len0 ) 00637 { 00638 int blockSizei = std::min(len0 - i, blockSize0), j = 0; 00639 00640 while (j < blockSizei) 00641 { 00642 int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; 00643 uint16x8_t v_pz = v_zerou; 00644 00645 for( ; k <= blockSizej - 8; k += 8 ) 00646 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)), 00647 vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1)); 00648 00649 v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); 00650 00651 src += blockSizej; 00652 j += blockSizej; 00653 } 00654 00655 i += blockSizei; 00656 } 00657 00658 CV_DECL_ALIGNED(16) unsigned int buf[4]; 00659 vst1q_u32(buf, v_nz); 00660 nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]); 00661 #endif 00662 return nz + countNonZero_(src, len - i); 00663 } 00664 00665 static int countNonZero32f( const float* src, int len ) 00666 { 00667 int i = 0, nz = 0; 00668 #if CV_SSE2 00669 if (USE_SSE2) 00670 { 00671 __m128 v_zero_f = _mm_setzero_ps(); 00672 __m128i v_zero = _mm_setzero_si128 (); 00673 __m128i sum = _mm_setzero_si128(); 00674 00675 for ( ; i <= len - 4; i += 4) 00676 { 00677 __m128 r0 = _mm_loadu_ps(src + i); 00678 sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero)); 00679 } 00680 00681 nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2); 00682 src += i; 00683 } 00684 #elif CV_NEON 00685 int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; 00686 uint32x4_t v_nz = vdupq_n_u32(0u); 00687 float32x4_t v_zero = vdupq_n_f32(0.0f); 00688 uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); 00689 00690 while( i < len0 ) 00691 { 00692 int blockSizei = std::min(len0 - i, blockSize0), j = 0; 00693 00694 while (j < blockSizei) 00695 { 00696 int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; 00697 uint16x8_t v_pz = v_zerou; 00698 00699 for( ; k <= blockSizej - 8; k += 8 ) 00700 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)), 00701 vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1)); 00702 00703 v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); 00704 00705 src += blockSizej; 00706 j += blockSizej; 00707 } 00708 00709 i += blockSizei; 00710 } 00711 00712 CV_DECL_ALIGNED(16) unsigned int buf[4]; 00713 vst1q_u32(buf, v_nz); 00714 nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]); 00715 #endif 00716 return nz + countNonZero_(src, len - i); 00717 } 00718 00719 static int countNonZero64f( const double* src, int len ) 00720 { 00721 return countNonZero_(src, len); 00722 } 00723 00724 typedef int (*CountNonZeroFunc)(const uchar*, int); 00725 00726 static CountNonZeroFunc getCountNonZeroTab(int depth) 00727 { 00728 static CountNonZeroFunc countNonZeroTab[] = 00729 { 00730 (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), 00731 (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), 00732 (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f), 00733 (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0 00734 }; 00735 00736 return countNonZeroTab[depth]; 00737 } 00738 00739 template <typename T, typename ST, typename SQT> 00740 struct SumSqr_SIMD 00741 { 00742 int operator () (const T *, const uchar *, ST *, SQT *, int, int) const 00743 { 00744 return 0; 00745 } 00746 }; 00747 00748 #if CV_SSE2 00749 00750 template <> 00751 struct SumSqr_SIMD<uchar, int, int> 00752 { 00753 int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const 00754 { 00755 if (mask || (cn != 1 && cn != 2) || !USE_SSE2) 00756 return 0; 00757 00758 int x = 0; 00759 __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; 00760 00761 for ( ; x <= len - 16; x += 16) 00762 { 00763 __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); 00764 __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero); 00765 00766 __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); 00767 __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); 00768 v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); 00769 v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); 00770 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00771 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00772 00773 v_half = _mm_unpackhi_epi8(v_src, v_zero); 00774 v_mullo = _mm_mullo_epi16(v_half, v_half); 00775 v_mulhi = _mm_mulhi_epi16(v_half, v_half); 00776 v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); 00777 v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); 00778 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00779 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00780 } 00781 00782 for ( ; x <= len - 8; x += 8) 00783 { 00784 __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero); 00785 00786 __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); 00787 __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); 00788 v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero)); 00789 v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero)); 00790 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00791 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00792 } 00793 00794 int CV_DECL_ALIGNED(16) ar[8]; 00795 _mm_store_si128((__m128i*)ar, v_sum); 00796 _mm_store_si128((__m128i*)(ar + 4), v_sqsum); 00797 00798 for (int i = 0; i < 4; i += cn) 00799 for (int j = 0; j < cn; ++j) 00800 { 00801 sum[j] += ar[j + i]; 00802 sqsum[j] += ar[4 + j + i]; 00803 } 00804 00805 return x / cn; 00806 } 00807 }; 00808 00809 template <> 00810 struct SumSqr_SIMD<schar, int, int> 00811 { 00812 int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const 00813 { 00814 if (mask || (cn != 1 && cn != 2) || !USE_SSE2) 00815 return 0; 00816 00817 int x = 0; 00818 __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; 00819 00820 for ( ; x <= len - 16; x += 16) 00821 { 00822 __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); 00823 __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); 00824 00825 __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); 00826 __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); 00827 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); 00828 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); 00829 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00830 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00831 00832 v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); 00833 v_mullo = _mm_mullo_epi16(v_half, v_half); 00834 v_mulhi = _mm_mulhi_epi16(v_half, v_half); 00835 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); 00836 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); 00837 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00838 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00839 } 00840 00841 for ( ; x <= len - 8; x += 8) 00842 { 00843 __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); 00844 00845 __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); 00846 __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); 00847 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); 00848 v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); 00849 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); 00850 v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); 00851 } 00852 00853 int CV_DECL_ALIGNED(16) ar[8]; 00854 _mm_store_si128((__m128i*)ar, v_sum); 00855 _mm_store_si128((__m128i*)(ar + 4), v_sqsum); 00856 00857 for (int i = 0; i < 4; i += cn) 00858 for (int j = 0; j < cn; ++j) 00859 { 00860 sum[j] += ar[j + i]; 00861 sqsum[j] += ar[4 + j + i]; 00862 } 00863 00864 return x / cn; 00865 } 00866 }; 00867 00868 #endif 00869 00870 template<typename T, typename ST, typename SQT> 00871 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn ) 00872 { 00873 const T* src = src0; 00874 00875 if( !mask ) 00876 { 00877 SumSqr_SIMD<T, ST, SQT> vop; 00878 int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4; 00879 src += i * cn; 00880 00881 if( k == 1 ) 00882 { 00883 ST s0 = sum[0]; 00884 SQT sq0 = sqsum[0]; 00885 for( ; i < len; i++, src += cn ) 00886 { 00887 T v = src[0]; 00888 s0 += v; sq0 += (SQT)v*v; 00889 } 00890 sum[0] = s0; 00891 sqsum[0] = sq0; 00892 } 00893 else if( k == 2 ) 00894 { 00895 ST s0 = sum[0], s1 = sum[1]; 00896 SQT sq0 = sqsum[0], sq1 = sqsum[1]; 00897 for( ; i < len; i++, src += cn ) 00898 { 00899 T v0 = src[0], v1 = src[1]; 00900 s0 += v0; sq0 += (SQT)v0*v0; 00901 s1 += v1; sq1 += (SQT)v1*v1; 00902 } 00903 sum[0] = s0; sum[1] = s1; 00904 sqsum[0] = sq0; sqsum[1] = sq1; 00905 } 00906 else if( k == 3 ) 00907 { 00908 ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; 00909 SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; 00910 for( ; i < len; i++, src += cn ) 00911 { 00912 T v0 = src[0], v1 = src[1], v2 = src[2]; 00913 s0 += v0; sq0 += (SQT)v0*v0; 00914 s1 += v1; sq1 += (SQT)v1*v1; 00915 s2 += v2; sq2 += (SQT)v2*v2; 00916 } 00917 sum[0] = s0; sum[1] = s1; sum[2] = s2; 00918 sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2; 00919 } 00920 00921 for( ; k < cn; k += 4 ) 00922 { 00923 src = src0 + k; 00924 ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3]; 00925 SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3]; 00926 for( ; i < len; i++, src += cn ) 00927 { 00928 T v0, v1; 00929 v0 = src[0], v1 = src[1]; 00930 s0 += v0; sq0 += (SQT)v0*v0; 00931 s1 += v1; sq1 += (SQT)v1*v1; 00932 v0 = src[2], v1 = src[3]; 00933 s2 += v0; sq2 += (SQT)v0*v0; 00934 s3 += v1; sq3 += (SQT)v1*v1; 00935 } 00936 sum[k] = s0; sum[k+1] = s1; 00937 sum[k+2] = s2; sum[k+3] = s3; 00938 sqsum[k] = sq0; sqsum[k+1] = sq1; 00939 sqsum[k+2] = sq2; sqsum[k+3] = sq3; 00940 } 00941 return len; 00942 } 00943 00944 int i, nzm = 0; 00945 00946 if( cn == 1 ) 00947 { 00948 ST s0 = sum[0]; 00949 SQT sq0 = sqsum[0]; 00950 for( i = 0; i < len; i++ ) 00951 if( mask[i] ) 00952 { 00953 T v = src[i]; 00954 s0 += v; sq0 += (SQT)v*v; 00955 nzm++; 00956 } 00957 sum[0] = s0; 00958 sqsum[0] = sq0; 00959 } 00960 else if( cn == 3 ) 00961 { 00962 ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; 00963 SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; 00964 for( i = 0; i < len; i++, src += 3 ) 00965 if( mask[i] ) 00966 { 00967 T v0 = src[0], v1 = src[1], v2 = src[2]; 00968 s0 += v0; sq0 += (SQT)v0*v0; 00969 s1 += v1; sq1 += (SQT)v1*v1; 00970 s2 += v2; sq2 += (SQT)v2*v2; 00971 nzm++; 00972 } 00973 sum[0] = s0; sum[1] = s1; sum[2] = s2; 00974 sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2; 00975 } 00976 else 00977 { 00978 for( i = 0; i < len; i++, src += cn ) 00979 if( mask[i] ) 00980 { 00981 for( int k = 0; k < cn; k++ ) 00982 { 00983 T v = src[k]; 00984 ST s = sum[k] + v; 00985 SQT sq = sqsum[k] + (SQT)v*v; 00986 sum[k] = s; sqsum[k] = sq; 00987 } 00988 nzm++; 00989 } 00990 } 00991 return nzm; 00992 } 00993 00994 00995 static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn ) 00996 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 00997 00998 static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn ) 00999 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01000 01001 static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn ) 01002 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01003 01004 static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn ) 01005 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01006 01007 static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn ) 01008 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01009 01010 static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn ) 01011 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01012 01013 static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn ) 01014 { return sumsqr_(src, mask, sum, sqsum, len, cn); } 01015 01016 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int); 01017 01018 static SumSqrFunc getSumSqrTab(int depth) 01019 { 01020 static SumSqrFunc sumSqrTab[] = 01021 { 01022 (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s, 01023 (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0 01024 }; 01025 01026 return sumSqrTab[depth]; 01027 } 01028 01029 #ifdef HAVE_OPENCL 01030 01031 template <typename T> Scalar ocl_part_sum(Mat m) 01032 { 01033 CV_Assert(m.rows == 1); 01034 01035 Scalar s = Scalar::all(0); 01036 int cn = m.channels(); 01037 const T * const ptr = m.ptr<T>(0); 01038 01039 for (int x = 0, w = m.cols * cn; x < w; ) 01040 for (int c = 0; c < cn; ++c, ++x) 01041 s[c] += ptr[x]; 01042 01043 return s; 01044 } 01045 01046 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 }; 01047 01048 static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(), 01049 InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() ) 01050 { 01051 CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); 01052 01053 const ocl::Device & dev = ocl::Device::getDefault(); 01054 bool doubleSupport = dev.doubleFPConfig() > 0, 01055 haveMask = _mask.kind() != _InputArray::NONE, 01056 haveSrc2 = _src2.kind() != _InputArray::NONE; 01057 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), 01058 kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1, 01059 mcn = std::max(cn, kercn); 01060 CV_Assert(!haveSrc2 || _src2.type() == type); 01061 int convert_cn = haveSrc2 ? mcn : cn; 01062 01063 if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) 01064 return false; 01065 01066 int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1); 01067 size_t wgs = dev.maxWorkGroupSize(); 01068 01069 int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth), 01070 dtype = CV_MAKE_TYPE(ddepth, cn); 01071 CV_Assert(!haveMask || _mask.type() == CV_8UC1); 01072 01073 int wgs2_aligned = 1; 01074 while (wgs2_aligned < (int)wgs) 01075 wgs2_aligned <<= 1; 01076 wgs2_aligned >>= 1; 01077 01078 static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; 01079 char cvt[2][40]; 01080 String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" 01081 " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s", 01082 ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), 01083 ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), 01084 ocl::typeToStr(ddepth), ddepth, cn, 01085 ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]), 01086 opMap[sum_op], (int)wgs, wgs2_aligned, 01087 doubleSupport ? " -D DOUBLE_SUPPORT" : "", 01088 haveMask ? " -D HAVE_MASK" : "", 01089 _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", 01090 haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, 01091 haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", 01092 haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", 01093 depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert"); 01094 01095 ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); 01096 if (k.empty()) 01097 return false; 01098 01099 UMat src = _src.getUMat(), src2 = _src2.getUMat(), 01100 db(1, dbsize, dtype), mask = _mask.getUMat(); 01101 01102 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 01103 dbarg = ocl::KernelArg::PtrWriteOnly(db), 01104 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask), 01105 src2arg = ocl::KernelArg::ReadOnlyNoSize(src2); 01106 01107 if (haveMask) 01108 { 01109 if (haveSrc2) 01110 k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg); 01111 else 01112 k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg); 01113 } 01114 else 01115 { 01116 if (haveSrc2) 01117 k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg); 01118 else 01119 k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg); 01120 } 01121 01122 size_t globalsize = ngroups * wgs; 01123 if (k.run(1, &globalsize, &wgs, false)) 01124 { 01125 typedef Scalar (*part_sum)(Mat m); 01126 part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> }, 01127 func = funcs[ddepth - CV_32S]; 01128 01129 Mat mres = db.getMat(ACCESS_READ); 01130 if (calc2) 01131 const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize)); 01132 01133 res = func(mres.colRange(0, ngroups)); 01134 return true; 01135 } 01136 return false; 01137 } 01138 01139 #endif 01140 01141 #ifdef HAVE_IPP 01142 static bool ipp_sum(Mat &src, Scalar &_res) 01143 { 01144 #if IPP_VERSION_X100 >= 700 01145 int cn = src.channels(); 01146 size_t total_size = src.total(); 01147 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; 01148 if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) ) 01149 { 01150 IppiSize sz = { cols, rows }; 01151 int type = src.type(); 01152 typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm); 01153 typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *); 01154 ippiSumFuncHint ippFuncHint = 01155 type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R : 01156 type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R : 01157 type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R : 01158 0; 01159 ippiSumFuncNoHint ippFuncNoHint = 01160 type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R : 01161 type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R : 01162 type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R : 01163 type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R : 01164 type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R : 01165 type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R : 01166 type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R : 01167 type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R : 01168 type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R : 01169 0; 01170 CV_Assert(!ippFuncHint || !ippFuncNoHint); 01171 if( ippFuncHint || ippFuncNoHint ) 01172 { 01173 Ipp64f res[4]; 01174 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) : 01175 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res); 01176 if( ret >= 0 ) 01177 { 01178 for( int i = 0; i < cn; i++ ) 01179 _res[i] = res[i]; 01180 return true; 01181 } 01182 } 01183 } 01184 #else 01185 CV_UNUSED(src); CV_UNUSED(_res); 01186 #endif 01187 return false; 01188 } 01189 #endif 01190 01191 } 01192 01193 cv::Scalar cv::sum( InputArray _src ) 01194 { 01195 #if defined HAVE_OPENCL || defined HAVE_IPP 01196 Scalar _res; 01197 #endif 01198 01199 #ifdef HAVE_OPENCL 01200 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, 01201 ocl_sum(_src, _res, OCL_OP_SUM), 01202 _res) 01203 #endif 01204 01205 Mat src = _src.getMat(); 01206 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res); 01207 01208 int k, cn = src.channels(), depth = src.depth(); 01209 SumFunc func = getSumFunc(depth); 01210 CV_Assert( cn <= 4 && func != 0 ); 01211 01212 const Mat* arrays[] = {&src, 0}; 01213 uchar* ptrs[1]; 01214 NAryMatIterator it(arrays, ptrs); 01215 Scalar s; 01216 int total = (int)it.size, blockSize = total, intSumBlockSize = 0; 01217 int j, count = 0; 01218 AutoBuffer<int> _buf; 01219 int* buf = (int*)&s[0]; 01220 size_t esz = 0; 01221 bool blockSum = depth < CV_32S; 01222 01223 if( blockSum ) 01224 { 01225 intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15); 01226 blockSize = std::min(blockSize, intSumBlockSize); 01227 _buf.allocate(cn); 01228 buf = _buf; 01229 01230 for( k = 0; k < cn; k++ ) 01231 buf[k] = 0; 01232 esz = src.elemSize(); 01233 } 01234 01235 for( size_t i = 0; i < it.nplanes; i++, ++it ) 01236 { 01237 for( j = 0; j < total; j += blockSize ) 01238 { 01239 int bsz = std::min(total - j, blockSize); 01240 func( ptrs[0], 0, (uchar*)buf, bsz, cn ); 01241 count += bsz; 01242 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) 01243 { 01244 for( k = 0; k < cn; k++ ) 01245 { 01246 s[k] += buf[k]; 01247 buf[k] = 0; 01248 } 01249 count = 0; 01250 } 01251 ptrs[0] += bsz*esz; 01252 } 01253 } 01254 return s; 01255 } 01256 01257 #ifdef HAVE_OPENCL 01258 01259 namespace cv { 01260 01261 static bool ocl_countNonZero( InputArray _src, int & res ) 01262 { 01263 int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src); 01264 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 01265 01266 if (depth == CV_64F && !doubleSupport) 01267 return false; 01268 01269 int dbsize = ocl::Device::getDefault().maxComputeUnits(); 01270 size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); 01271 01272 int wgs2_aligned = 1; 01273 while (wgs2_aligned < (int)wgs) 01274 wgs2_aligned <<= 1; 01275 wgs2_aligned >>= 1; 01276 01277 ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, 01278 format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO" 01279 " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s", 01280 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), 01281 ocl::typeToStr(depth), (int)wgs, kercn, 01282 wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", 01283 _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); 01284 if (k.empty()) 01285 return false; 01286 01287 UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1); 01288 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), 01289 dbsize, ocl::KernelArg::PtrWriteOnly(db)); 01290 01291 size_t globalsize = dbsize * wgs; 01292 if (k.run(1, &globalsize, &wgs, true)) 01293 return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true; 01294 return false; 01295 } 01296 01297 } 01298 01299 #endif 01300 01301 #if defined HAVE_IPP 01302 namespace cv { 01303 01304 static bool ipp_countNonZero( Mat &src, int &res ) 01305 { 01306 #if !defined HAVE_IPP_ICV_ONLY 01307 Ipp32s count = 0; 01308 IppStatus status = ippStsNoErr; 01309 01310 int type = src.type(), depth = CV_MAT_DEPTH(type); 01311 IppiSize roiSize = { src.cols, src.rows }; 01312 Ipp32s srcstep = (Ipp32s)src.step; 01313 if (src.isContinuous()) 01314 { 01315 roiSize.width = (Ipp32s)src.total(); 01316 roiSize.height = 1; 01317 srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type); 01318 } 01319 01320 if (depth == CV_8U) 01321 status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0); 01322 else if (depth == CV_32F) 01323 status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0); 01324 01325 if (status >= 0) 01326 { 01327 res = ((Ipp32s)src.total() - count); 01328 return true; 01329 } 01330 #else 01331 CV_UNUSED(src); CV_UNUSED(res); 01332 #endif 01333 return false; 01334 } 01335 } 01336 #endif 01337 01338 01339 int cv::countNonZero( InputArray _src ) 01340 { 01341 int type = _src.type(), cn = CV_MAT_CN(type); 01342 CV_Assert( cn == 1 ); 01343 01344 #if defined HAVE_OPENCL || defined HAVE_IPP 01345 int res = -1; 01346 #endif 01347 01348 #ifdef HAVE_OPENCL 01349 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, 01350 ocl_countNonZero(_src, res), 01351 res) 01352 #endif 01353 01354 Mat src = _src.getMat(); 01355 CV_IPP_RUN(0 && (_src.dims() <= 2 || _src.isContinuous()), ipp_countNonZero(src, res), res); 01356 01357 CountNonZeroFunc func = getCountNonZeroTab(src.depth()); 01358 CV_Assert( func != 0 ); 01359 01360 const Mat* arrays[] = {&src, 0}; 01361 uchar* ptrs[1]; 01362 NAryMatIterator it(arrays, ptrs); 01363 int total = (int)it.size, nz = 0; 01364 01365 for( size_t i = 0; i < it.nplanes; i++, ++it ) 01366 nz += func( ptrs[0], total ); 01367 01368 return nz; 01369 } 01370 01371 #if defined HAVE_IPP 01372 namespace cv 01373 { 01374 static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret ) 01375 { 01376 #if IPP_VERSION_X100 >= 700 01377 size_t total_size = src.total(); 01378 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; 01379 if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) ) 01380 { 01381 IppiSize sz = { cols, rows }; 01382 int type = src.type(); 01383 if( !mask.empty() ) 01384 { 01385 typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *); 01386 ippiMaskMeanFuncC1 ippFuncC1 = 01387 type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR : 01388 type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR : 01389 type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR : 01390 0; 01391 if( ippFuncC1 ) 01392 { 01393 Ipp64f res; 01394 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 ) 01395 { 01396 ret = Scalar(res); 01397 return true; 01398 } 01399 } 01400 typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *); 01401 ippiMaskMeanFuncC3 ippFuncC3 = 01402 type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR : 01403 type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR : 01404 type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR : 01405 0; 01406 if( ippFuncC3 ) 01407 { 01408 Ipp64f res1, res2, res3; 01409 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 && 01410 ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 && 01411 ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 ) 01412 { 01413 ret = Scalar(res1, res2, res3); 01414 return true; 01415 } 01416 } 01417 } 01418 else 01419 { 01420 typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm); 01421 typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *); 01422 ippiMeanFuncHint ippFuncHint = 01423 type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R : 01424 type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R : 01425 type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R : 01426 0; 01427 ippiMeanFuncNoHint ippFuncNoHint = 01428 type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R : 01429 type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R : 01430 type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R : 01431 type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R : 01432 type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R : 01433 type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R : 01434 type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R : 01435 type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R : 01436 type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R : 01437 0; 01438 // Make sure only zero or one version of the function pointer is valid 01439 CV_Assert(!ippFuncHint || !ippFuncNoHint); 01440 if( ippFuncHint || ippFuncNoHint ) 01441 { 01442 Ipp64f res[4]; 01443 IppStatus status = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) : 01444 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res); 01445 if( status >= 0 ) 01446 { 01447 for( int i = 0; i < src.channels(); i++ ) 01448 ret[i] = res[i]; 01449 return true; 01450 } 01451 } 01452 } 01453 } 01454 return false; 01455 #else 01456 return false; 01457 #endif 01458 } 01459 } 01460 #endif 01461 01462 cv::Scalar cv::mean( InputArray _src, InputArray _mask ) 01463 { 01464 Mat src = _src.getMat(), mask = _mask.getMat(); 01465 CV_Assert( mask.empty() || mask.type() == CV_8U ); 01466 01467 int k, cn = src.channels(), depth = src.depth(); 01468 Scalar s; 01469 01470 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_mean(src, mask, s), s) 01471 01472 SumFunc func = getSumFunc(depth); 01473 01474 CV_Assert( cn <= 4 && func != 0 ); 01475 01476 const Mat* arrays[] = {&src, &mask, 0}; 01477 uchar* ptrs[2]; 01478 NAryMatIterator it(arrays, ptrs); 01479 int total = (int)it.size, blockSize = total, intSumBlockSize = 0; 01480 int j, count = 0; 01481 AutoBuffer<int> _buf; 01482 int* buf = (int*)&s[0]; 01483 bool blockSum = depth <= CV_16S; 01484 size_t esz = 0, nz0 = 0; 01485 01486 if( blockSum ) 01487 { 01488 intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15); 01489 blockSize = std::min(blockSize, intSumBlockSize); 01490 _buf.allocate(cn); 01491 buf = _buf; 01492 01493 for( k = 0; k < cn; k++ ) 01494 buf[k] = 0; 01495 esz = src.elemSize(); 01496 } 01497 01498 for( size_t i = 0; i < it.nplanes; i++, ++it ) 01499 { 01500 for( j = 0; j < total; j += blockSize ) 01501 { 01502 int bsz = std::min(total - j, blockSize); 01503 int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn ); 01504 count += nz; 01505 nz0 += nz; 01506 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) 01507 { 01508 for( k = 0; k < cn; k++ ) 01509 { 01510 s[k] += buf[k]; 01511 buf[k] = 0; 01512 } 01513 count = 0; 01514 } 01515 ptrs[0] += bsz*esz; 01516 if( ptrs[1] ) 01517 ptrs[1] += bsz; 01518 } 01519 } 01520 return s*(nz0 ? 1./nz0 : 0); 01521 } 01522 01523 #ifdef HAVE_OPENCL 01524 01525 namespace cv { 01526 01527 static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask ) 01528 { 01529 bool haveMask = _mask.kind() != _InputArray::NONE; 01530 int nz = haveMask ? -1 : (int)_src.total(); 01531 Scalar mean, stddev; 01532 01533 { 01534 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 01535 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, 01536 isContinuous = _src.isContinuous(), 01537 isMaskContinuous = _mask.isContinuous(); 01538 const ocl::Device &defDev = ocl::Device::getDefault(); 01539 int groups = defDev.maxComputeUnits(); 01540 if (defDev.isIntel()) 01541 { 01542 static const int subSliceEUCount = 10; 01543 groups = (groups / subSliceEUCount) * 2; 01544 } 01545 size_t wgs = defDev.maxWorkGroupSize(); 01546 01547 int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth), 01548 dtype = CV_MAKE_TYPE(ddepth, cn), 01549 sqdtype = CV_MAKETYPE(sqddepth, cn); 01550 CV_Assert(!haveMask || _mask.type() == CV_8UC1); 01551 01552 int wgs2_aligned = 1; 01553 while (wgs2_aligned < (int)wgs) 01554 wgs2_aligned <<= 1; 01555 wgs2_aligned >>= 1; 01556 01557 if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) 01558 return false; 01559 01560 char cvt[2][40]; 01561 String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d" 01562 " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s" 01563 " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", 01564 ocl::typeToStr(type), ocl::typeToStr(depth), 01565 ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth, 01566 ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth), 01567 ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]), 01568 cn, isContinuous ? " -D HAVE_SRC_CONT" : "", 01569 isMaskContinuous ? " -D HAVE_MASK_CONT" : "", 01570 ocl::convertTypeStr(depth, ddepth, cn, cvt[1]), 01571 (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "", 01572 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 01573 01574 ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts); 01575 if (k.empty()) 01576 return false; 01577 01578 int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) + 01579 CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype)); 01580 UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); 01581 01582 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), 01583 dbarg = ocl::KernelArg::PtrWriteOnly(db), 01584 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); 01585 01586 if (haveMask) 01587 k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg); 01588 else 01589 k.args(srcarg, src.cols, (int)src.total(), groups, dbarg); 01590 01591 size_t globalsize = groups * wgs; 01592 if (!k.run(1, &globalsize, &wgs, false)) 01593 return false; 01594 01595 typedef Scalar (* part_sum)(Mat m); 01596 part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> }; 01597 Mat dbm = db.getMat(ACCESS_READ); 01598 01599 mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr())); 01600 stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype))); 01601 01602 if (haveMask) 01603 nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() + 01604 groups * (CV_ELEM_SIZE(dtype) + 01605 CV_ELEM_SIZE(sqdtype))))[0]); 01606 } 01607 01608 double total = nz != 0 ? 1.0 / nz : 0; 01609 int k, j, cn = _src.channels(); 01610 for (int i = 0; i < cn; ++i) 01611 { 01612 mean[i] *= total; 01613 stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.)); 01614 } 01615 01616 for( j = 0; j < 2; j++ ) 01617 { 01618 const double * const sptr = j == 0 ? &mean[0] : &stddev[0]; 01619 _OutputArray _dst = j == 0 ? _mean : _sdv; 01620 if( !_dst.needed() ) 01621 continue; 01622 01623 if( !_dst.fixedSize() ) 01624 _dst.create(cn, 1, CV_64F, -1, true); 01625 Mat dst = _dst.getMat(); 01626 int dcn = (int)dst.total(); 01627 CV_Assert( dst.type() == CV_64F && dst.isContinuous() && 01628 (dst.cols == 1 || dst.rows == 1) && dcn >= cn ); 01629 double* dptr = dst.ptr<double>(); 01630 for( k = 0; k < cn; k++ ) 01631 dptr[k] = sptr[k]; 01632 for( ; k < dcn; k++ ) 01633 dptr[k] = 0; 01634 } 01635 01636 return true; 01637 } 01638 01639 } 01640 01641 #endif 01642 01643 #ifdef HAVE_IPP 01644 namespace cv 01645 { 01646 static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask) 01647 { 01648 #if IPP_VERSION_X100 >= 700 01649 int cn = src.channels(); 01650 size_t total_size = src.total(); 01651 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; 01652 if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) ) 01653 { 01654 Ipp64f mean_temp[3]; 01655 Ipp64f stddev_temp[3]; 01656 Ipp64f *pmean = &mean_temp[0]; 01657 Ipp64f *pstddev = &stddev_temp[0]; 01658 Mat mean, stddev; 01659 int dcn_mean = -1; 01660 if( _mean.needed() ) 01661 { 01662 if( !_mean.fixedSize() ) 01663 _mean.create(cn, 1, CV_64F, -1, true); 01664 mean = _mean.getMat(); 01665 dcn_mean = (int)mean.total(); 01666 pmean = mean.ptr<Ipp64f>(); 01667 } 01668 int dcn_stddev = -1; 01669 if( _sdv.needed() ) 01670 { 01671 if( !_sdv.fixedSize() ) 01672 _sdv.create(cn, 1, CV_64F, -1, true); 01673 stddev = _sdv.getMat(); 01674 dcn_stddev = (int)stddev.total(); 01675 pstddev = stddev.ptr<Ipp64f>(); 01676 } 01677 for( int c = cn; c < dcn_mean; c++ ) 01678 pmean[c] = 0; 01679 for( int c = cn; c < dcn_stddev; c++ ) 01680 pstddev[c] = 0; 01681 IppiSize sz = { cols, rows }; 01682 int type = src.type(); 01683 if( !mask.empty() ) 01684 { 01685 typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *); 01686 ippiMaskMeanStdDevFuncC1 ippFuncC1 = 01687 type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR : 01688 type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR : 01689 type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR : 01690 0; 01691 if( ippFuncC1 ) 01692 { 01693 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 ) 01694 { 01695 return true; 01696 } 01697 } 01698 typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *); 01699 ippiMaskMeanStdDevFuncC3 ippFuncC3 = 01700 type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR : 01701 type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR : 01702 type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR : 01703 0; 01704 if( ippFuncC3 ) 01705 { 01706 if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 && 01707 ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 && 01708 ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 ) 01709 { 01710 return true; 01711 } 01712 } 01713 } 01714 else 01715 { 01716 typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *); 01717 ippiMeanStdDevFuncC1 ippFuncC1 = 01718 type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R : 01719 type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R : 01720 #if (IPP_VERSION_X100 >= 810) 01721 type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0 01722 #endif 01723 0; 01724 if( ippFuncC1 ) 01725 { 01726 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 ) 01727 { 01728 return true; 01729 } 01730 } 01731 typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *); 01732 ippiMeanStdDevFuncC3 ippFuncC3 = 01733 type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR : 01734 type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR : 01735 type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR : 01736 0; 01737 if( ippFuncC3 ) 01738 { 01739 if( ippFuncC3(src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 && 01740 ippFuncC3(src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 && 01741 ippFuncC3(src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 ) 01742 { 01743 return true; 01744 } 01745 } 01746 } 01747 } 01748 #else 01749 CV_UNUSED(src); CV_UNUSED(_mean); CV_UNUSED(_sdv); CV_UNUSED(mask); 01750 #endif 01751 return false; 01752 } 01753 } 01754 #endif 01755 01756 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask ) 01757 { 01758 #ifdef HAVE_OPENCL 01759 CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, 01760 ocl_meanStdDev(_src, _mean, _sdv, _mask)) 01761 #endif 01762 01763 Mat src = _src.getMat(), mask = _mask.getMat(); 01764 CV_Assert( mask.empty() || mask.type() == CV_8UC1 ); 01765 01766 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask)); 01767 01768 int k, cn = src.channels(), depth = src.depth(); 01769 01770 SumSqrFunc func = getSumSqrTab(depth); 01771 01772 CV_Assert( func != 0 ); 01773 01774 const Mat* arrays[] = {&src, &mask, 0}; 01775 uchar* ptrs[2]; 01776 NAryMatIterator it(arrays, ptrs); 01777 int total = (int)it.size, blockSize = total, intSumBlockSize = 0; 01778 int j, count = 0, nz0 = 0; 01779 AutoBuffer<double> _buf(cn*4); 01780 double *s = (double*)_buf, *sq = s + cn; 01781 int *sbuf = (int*)s, *sqbuf = (int*)sq; 01782 bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S; 01783 size_t esz = 0; 01784 01785 for( k = 0; k < cn; k++ ) 01786 s[k] = sq[k] = 0; 01787 01788 if( blockSum ) 01789 { 01790 intSumBlockSize = 1 << 15; 01791 blockSize = std::min(blockSize, intSumBlockSize); 01792 sbuf = (int*)(sq + cn); 01793 if( blockSqSum ) 01794 sqbuf = sbuf + cn; 01795 for( k = 0; k < cn; k++ ) 01796 sbuf[k] = sqbuf[k] = 0; 01797 esz = src.elemSize(); 01798 } 01799 01800 for( size_t i = 0; i < it.nplanes; i++, ++it ) 01801 { 01802 for( j = 0; j < total; j += blockSize ) 01803 { 01804 int bsz = std::min(total - j, blockSize); 01805 int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn ); 01806 count += nz; 01807 nz0 += nz; 01808 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) 01809 { 01810 for( k = 0; k < cn; k++ ) 01811 { 01812 s[k] += sbuf[k]; 01813 sbuf[k] = 0; 01814 } 01815 if( blockSqSum ) 01816 { 01817 for( k = 0; k < cn; k++ ) 01818 { 01819 sq[k] += sqbuf[k]; 01820 sqbuf[k] = 0; 01821 } 01822 } 01823 count = 0; 01824 } 01825 ptrs[0] += bsz*esz; 01826 if( ptrs[1] ) 01827 ptrs[1] += bsz; 01828 } 01829 } 01830 01831 double scale = nz0 ? 1./nz0 : 0.; 01832 for( k = 0; k < cn; k++ ) 01833 { 01834 s[k] *= scale; 01835 sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.)); 01836 } 01837 01838 for( j = 0; j < 2; j++ ) 01839 { 01840 const double* sptr = j == 0 ? s : sq; 01841 _OutputArray _dst = j == 0 ? _mean : _sdv; 01842 if( !_dst.needed() ) 01843 continue; 01844 01845 if( !_dst.fixedSize() ) 01846 _dst.create(cn, 1, CV_64F, -1, true); 01847 Mat dst = _dst.getMat(); 01848 int dcn = (int)dst.total(); 01849 CV_Assert( dst.type() == CV_64F && dst.isContinuous() && 01850 (dst.cols == 1 || dst.rows == 1) && dcn >= cn ); 01851 double* dptr = dst.ptr<double>(); 01852 for( k = 0; k < cn; k++ ) 01853 dptr[k] = sptr[k]; 01854 for( ; k < dcn; k++ ) 01855 dptr[k] = 0; 01856 } 01857 } 01858 01859 /****************************************************************************************\ 01860 * minMaxLoc * 01861 \****************************************************************************************/ 01862 01863 namespace cv 01864 { 01865 01866 template<typename T, typename WT> static void 01867 minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, 01868 size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx ) 01869 { 01870 WT minVal = *_minVal, maxVal = *_maxVal; 01871 size_t minIdx = *_minIdx, maxIdx = *_maxIdx; 01872 01873 if( !mask ) 01874 { 01875 for( int i = 0; i < len; i++ ) 01876 { 01877 T val = src[i]; 01878 if( val < minVal ) 01879 { 01880 minVal = val; 01881 minIdx = startIdx + i; 01882 } 01883 if( val > maxVal ) 01884 { 01885 maxVal = val; 01886 maxIdx = startIdx + i; 01887 } 01888 } 01889 } 01890 else 01891 { 01892 for( int i = 0; i < len; i++ ) 01893 { 01894 T val = src[i]; 01895 if( mask[i] && val < minVal ) 01896 { 01897 minVal = val; 01898 minIdx = startIdx + i; 01899 } 01900 if( mask[i] && val > maxVal ) 01901 { 01902 maxVal = val; 01903 maxIdx = startIdx + i; 01904 } 01905 } 01906 } 01907 01908 *_minIdx = minIdx; 01909 *_maxIdx = maxIdx; 01910 *_minVal = minVal; 01911 *_maxVal = maxVal; 01912 } 01913 01914 static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval, 01915 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01916 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01917 01918 static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval, 01919 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01920 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01921 01922 static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval, 01923 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01924 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01925 01926 static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval, 01927 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01928 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01929 01930 static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval, 01931 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01932 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01933 01934 static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval, 01935 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01936 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01937 01938 static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval, 01939 size_t* minidx, size_t* maxidx, int len, size_t startidx ) 01940 { minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); } 01941 01942 typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t); 01943 01944 static MinMaxIdxFunc getMinmaxTab(int depth) 01945 { 01946 static MinMaxIdxFunc minmaxTab[] = 01947 { 01948 (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s), 01949 (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s), 01950 (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s), 01951 (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f), 01952 0 01953 }; 01954 01955 return minmaxTab[depth]; 01956 } 01957 01958 static void ofs2idx(const Mat& a, size_t ofs, int* idx) 01959 { 01960 int i, d = a.dims; 01961 if( ofs > 0 ) 01962 { 01963 ofs--; 01964 for( i = d-1; i >= 0; i-- ) 01965 { 01966 int sz = a.size[i]; 01967 idx[i] = (int)(ofs % sz); 01968 ofs /= sz; 01969 } 01970 } 01971 else 01972 { 01973 for( i = d-1; i >= 0; i-- ) 01974 idx[i] = -1; 01975 } 01976 } 01977 01978 #ifdef HAVE_OPENCL 01979 01980 #define MINMAX_STRUCT_ALIGNMENT 8 // sizeof double 01981 01982 template <typename T> 01983 void getMinMaxRes(const Mat & db, double * minVal, double * maxVal, 01984 int* minLoc, int* maxLoc, 01985 int groupnum, int cols, double * maxVal2) 01986 { 01987 uint index_max = std::numeric_limits<uint>::max(); 01988 T minval = std::numeric_limits<T>::max(); 01989 T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval; 01990 uint minloc = index_max, maxloc = index_max; 01991 01992 size_t index = 0; 01993 const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL; 01994 const uint * minlocptr = NULL, * maxlocptr = NULL; 01995 if (minVal || minLoc) 01996 { 01997 minptr = db.ptr<T>(); 01998 index += sizeof(T) * groupnum; 01999 index = alignSize(index, MINMAX_STRUCT_ALIGNMENT); 02000 } 02001 if (maxVal || maxLoc) 02002 { 02003 maxptr = (const T *)(db.ptr() + index); 02004 index += sizeof(T) * groupnum; 02005 index = alignSize(index, MINMAX_STRUCT_ALIGNMENT); 02006 } 02007 if (minLoc) 02008 { 02009 minlocptr = (const uint *)(db.ptr() + index); 02010 index += sizeof(uint) * groupnum; 02011 index = alignSize(index, MINMAX_STRUCT_ALIGNMENT); 02012 } 02013 if (maxLoc) 02014 { 02015 maxlocptr = (const uint *)(db.ptr() + index); 02016 index += sizeof(uint) * groupnum; 02017 index = alignSize(index, MINMAX_STRUCT_ALIGNMENT); 02018 } 02019 if (maxVal2) 02020 maxptr2 = (const T *)(db.ptr() + index); 02021 02022 for (int i = 0; i < groupnum; i++) 02023 { 02024 if (minptr && minptr[i] <= minval) 02025 { 02026 if (minptr[i] == minval) 02027 { 02028 if (minlocptr) 02029 minloc = std::min(minlocptr[i], minloc); 02030 } 02031 else 02032 { 02033 if (minlocptr) 02034 minloc = minlocptr[i]; 02035 minval = minptr[i]; 02036 } 02037 } 02038 if (maxptr && maxptr[i] >= maxval) 02039 { 02040 if (maxptr[i] == maxval) 02041 { 02042 if (maxlocptr) 02043 maxloc = std::min(maxlocptr[i], maxloc); 02044 } 02045 else 02046 { 02047 if (maxlocptr) 02048 maxloc = maxlocptr[i]; 02049 maxval = maxptr[i]; 02050 } 02051 } 02052 if (maxptr2 && maxptr2[i] > maxval2) 02053 maxval2 = maxptr2[i]; 02054 } 02055 bool zero_mask = (minLoc && minloc == index_max) || 02056 (maxLoc && maxloc == index_max); 02057 02058 if (minVal) 02059 *minVal = zero_mask ? 0 : (double)minval; 02060 if (maxVal) 02061 *maxVal = zero_mask ? 0 : (double)maxval; 02062 if (maxVal2) 02063 *maxVal2 = zero_mask ? 0 : (double)maxval2; 02064 02065 if (minLoc) 02066 { 02067 minLoc[0] = zero_mask ? -1 : minloc / cols; 02068 minLoc[1] = zero_mask ? -1 : minloc % cols; 02069 } 02070 if (maxLoc) 02071 { 02072 maxLoc[0] = zero_mask ? -1 : maxloc / cols; 02073 maxLoc[1] = zero_mask ? -1 : maxloc % cols; 02074 } 02075 } 02076 02077 typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal, 02078 int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2); 02079 02080 static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, 02081 int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL) 02082 { 02083 const ocl::Device & dev = ocl::Device::getDefault(); 02084 02085 #ifdef ANDROID 02086 if (dev.isNVidia()) 02087 return false; 02088 #endif 02089 02090 bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), 02091 haveSrc2 = _src2.kind() != _InputArray::NONE; 02092 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), 02093 kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2)); 02094 02095 // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014) 02096 if ((haveMask || type == CV_32FC1) && dev.isAMD()) 02097 return false; 02098 02099 CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) || 02100 (cn >= 1 && !minLoc && !maxLoc) ); 02101 02102 if (ddepth < 0) 02103 ddepth = depth; 02104 02105 CV_Assert(!haveSrc2 || _src2.type() == type); 02106 02107 if (depth == CV_32S) 02108 return false; 02109 02110 if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport) 02111 return false; 02112 02113 int groupnum = dev.maxComputeUnits(); 02114 size_t wgs = dev.maxWorkGroupSize(); 02115 02116 int wgs2_aligned = 1; 02117 while (wgs2_aligned < (int)wgs) 02118 wgs2_aligned <<= 1; 02119 wgs2_aligned >>= 1; 02120 02121 bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL, 02122 needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL; 02123 02124 // in case of mask we must know whether mask is filled with zeros or not 02125 // so let's calculate min or max location, if it's undefined, so mask is zeros 02126 if (!(needMaxLoc || needMinLoc) && haveMask) 02127 { 02128 if (needMinVal) 02129 needMinLoc = true; 02130 else 02131 needMaxLoc = true; 02132 } 02133 02134 char cvt[2][40]; 02135 String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" 02136 " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" 02137 " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s" 02138 " -D MINMAX_STRUCT_ALIGNMENT=%d", 02139 depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, 02140 ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, 02141 doubleSupport ? " -D DOUBLE_SUPPORT" : "", 02142 _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", 02143 _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, 02144 needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", 02145 needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", 02146 ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), 02147 ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]), 02148 absValues ? " -D OP_ABS" : "", 02149 haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", 02150 haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth, 02151 depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert", 02152 MINMAX_STRUCT_ALIGNMENT); 02153 02154 ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); 02155 if (k.empty()) 02156 return false; 02157 02158 int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), 02159 dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + 02160 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) + 02161 (maxVal2 ? esz : 0)) 02162 + 5 * MINMAX_STRUCT_ALIGNMENT; 02163 UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); 02164 02165 if (cn > 1 && !haveMask) 02166 { 02167 src = src.reshape(1); 02168 src2 = src2.reshape(1); 02169 } 02170 02171 if (haveSrc2) 02172 { 02173 if (!haveMask) 02174 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), 02175 groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2)); 02176 else 02177 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), 02178 groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask), 02179 ocl::KernelArg::ReadOnlyNoSize(src2)); 02180 } 02181 else 02182 { 02183 if (!haveMask) 02184 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), 02185 groupnum, ocl::KernelArg::PtrWriteOnly(db)); 02186 else 02187 k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), 02188 groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); 02189 } 02190 02191 size_t globalsize = groupnum * wgs; 02192 if (!k.run(1, &globalsize, &wgs, true)) 02193 return false; 02194 02195 static const getMinMaxResFunc functab[7] = 02196 { 02197 getMinMaxRes<uchar>, 02198 getMinMaxRes<char>, 02199 getMinMaxRes<ushort>, 02200 getMinMaxRes<short>, 02201 getMinMaxRes<int>, 02202 getMinMaxRes<float>, 02203 getMinMaxRes<double> 02204 }; 02205 02206 getMinMaxResFunc func = functab[ddepth]; 02207 02208 int locTemp[2]; 02209 func(db.getMat(ACCESS_READ), minVal, maxVal, 02210 needMinLoc ? minLoc ? minLoc : locTemp : minLoc, 02211 needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc, 02212 groupnum, src.cols, maxVal2); 02213 02214 return true; 02215 } 02216 02217 #endif 02218 02219 #ifdef HAVE_IPP 02220 static bool ipp_minMaxIdx( Mat &src, double* minVal, double* maxVal, int* minIdx, int* maxIdx, Mat &mask) 02221 { 02222 #if IPP_VERSION_X100 >= 700 02223 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 02224 size_t total_size = src.total(); 02225 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; 02226 if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) ) 02227 { 02228 IppiSize sz = { cols * cn, rows }; 02229 02230 if( !mask.empty() ) 02231 { 02232 typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int, 02233 IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *); 02234 02235 CV_SUPPRESS_DEPRECATED_START 02236 ippiMaskMinMaxIndxFuncC1 ippFuncC1 = 02237 type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR : 02238 #if IPP_VERSION_X100 < 900 02239 type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR : 02240 #endif 02241 type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR : 02242 type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0; 02243 CV_SUPPRESS_DEPRECATED_END 02244 02245 if( ippFuncC1 ) 02246 { 02247 Ipp32f min, max; 02248 IppiPoint minp, maxp; 02249 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 ) 02250 { 02251 if( minVal ) 02252 *minVal = (double)min; 02253 if( maxVal ) 02254 *maxVal = (double)max; 02255 if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.ptr()[0] ) 02256 minp.x = maxp.x = -1; 02257 if( minIdx ) 02258 { 02259 size_t minidx = minp.y * cols + minp.x + 1; 02260 ofs2idx(src, minidx, minIdx); 02261 } 02262 if( maxIdx ) 02263 { 02264 size_t maxidx = maxp.y * cols + maxp.x + 1; 02265 ofs2idx(src, maxidx, maxIdx); 02266 } 02267 return true; 02268 } 02269 } 02270 } 02271 else 02272 { 02273 typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *); 02274 02275 CV_SUPPRESS_DEPRECATED_START 02276 ippiMinMaxIndxFuncC1 ippFuncC1 = 02277 #if IPP_VERSION_X100 != 900 // bug in 9.0.0 avx2 optimization 02278 depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R : 02279 #endif 02280 #if IPP_VERSION_X100 < 900 02281 depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R : 02282 #endif 02283 depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R : 02284 #if !((defined _MSC_VER && defined _M_IX86) || defined __i386__) 02285 depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R : 02286 #endif 02287 0; 02288 CV_SUPPRESS_DEPRECATED_END 02289 02290 if( ippFuncC1 ) 02291 { 02292 Ipp32f min, max; 02293 IppiPoint minp, maxp; 02294 if( ippFuncC1(src.ptr(), (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 ) 02295 { 02296 if( minVal ) 02297 *minVal = (double)min; 02298 if( maxVal ) 02299 *maxVal = (double)max; 02300 if( minIdx ) 02301 { 02302 size_t minidx = minp.y * cols + minp.x + 1; 02303 ofs2idx(src, minidx, minIdx); 02304 } 02305 if( maxIdx ) 02306 { 02307 size_t maxidx = maxp.y * cols + maxp.x + 1; 02308 ofs2idx(src, maxidx, maxIdx); 02309 } 02310 return true; 02311 } 02312 } 02313 } 02314 } 02315 #else 02316 #endif 02317 CV_UNUSED(src); CV_UNUSED(minVal); CV_UNUSED(maxVal); CV_UNUSED(minIdx); CV_UNUSED(maxIdx); CV_UNUSED(mask); 02318 return false; 02319 } 02320 #endif 02321 02322 } 02323 02324 void cv::minMaxIdx(InputArray _src, double* minVal, 02325 double* maxVal, int* minIdx, int* maxIdx, 02326 InputArray _mask) 02327 { 02328 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 02329 CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) || 02330 (cn > 1 && _mask.empty() && !minIdx && !maxIdx) ); 02331 02332 #ifdef HAVE_OPENCL 02333 CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2 && (_mask.empty() || _src.size() == _mask.size()), 02334 ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask)) 02335 #endif 02336 02337 Mat src = _src.getMat(), mask = _mask.getMat(); 02338 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask)) 02339 02340 MinMaxIdxFunc func = getMinmaxTab(depth); 02341 CV_Assert( func != 0 ); 02342 02343 const Mat* arrays[] = {&src, &mask, 0}; 02344 uchar* ptrs[2]; 02345 NAryMatIterator it(arrays, ptrs); 02346 02347 size_t minidx = 0, maxidx = 0; 02348 int iminval = INT_MAX, imaxval = INT_MIN; 02349 float fminval = std::numeric_limits<float>::infinity(), fmaxval = -fminval; 02350 double dminval = std::numeric_limits<double>::infinity(), dmaxval = -dminval; 02351 size_t startidx = 1; 02352 int *minval = &iminval, *maxval = &imaxval; 02353 int planeSize = (int)it.size*cn; 02354 02355 if( depth == CV_32F ) 02356 minval = (int*)&fminval, maxval = (int*)&fmaxval; 02357 else if( depth == CV_64F ) 02358 minval = (int*)&dminval, maxval = (int*)&dmaxval; 02359 02360 for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize ) 02361 func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx ); 02362 02363 if (!src.empty() && mask.empty()) 02364 { 02365 if( minidx == 0 ) 02366 minidx = 1; 02367 if( maxidx == 0 ) 02368 maxidx = 1; 02369 } 02370 02371 if( minidx == 0 ) 02372 dminval = dmaxval = 0; 02373 else if( depth == CV_32F ) 02374 dminval = fminval, dmaxval = fmaxval; 02375 else if( depth <= CV_32S ) 02376 dminval = iminval, dmaxval = imaxval; 02377 02378 if( minVal ) 02379 *minVal = dminval; 02380 if( maxVal ) 02381 *maxVal = dmaxval; 02382 02383 if( minIdx ) 02384 ofs2idx(src, minidx, minIdx); 02385 if( maxIdx ) 02386 ofs2idx(src, maxidx, maxIdx); 02387 } 02388 02389 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal, 02390 Point* minLoc, Point* maxLoc, InputArray mask ) 02391 { 02392 CV_Assert(_img.dims() <= 2); 02393 02394 minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask); 02395 if( minLoc ) 02396 std::swap(minLoc->x, minLoc->y); 02397 if( maxLoc ) 02398 std::swap(maxLoc->x, maxLoc->y); 02399 } 02400 02401 /****************************************************************************************\ 02402 * norm * 02403 \****************************************************************************************/ 02404 02405 namespace cv 02406 { 02407 02408 template<typename T, typename ST> int 02409 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) 02410 { 02411 ST result = *_result; 02412 if( !mask ) 02413 { 02414 result = std::max(result, normInf<T, ST>(src, len*cn)); 02415 } 02416 else 02417 { 02418 for( int i = 0; i < len; i++, src += cn ) 02419 if( mask[i] ) 02420 { 02421 for( int k = 0; k < cn; k++ ) 02422 result = std::max(result, ST(cv_abs(src[k]))); 02423 } 02424 } 02425 *_result = result; 02426 return 0; 02427 } 02428 02429 template<typename T, typename ST> int 02430 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) 02431 { 02432 ST result = *_result; 02433 if( !mask ) 02434 { 02435 result += normL1<T, ST>(src, len*cn); 02436 } 02437 else 02438 { 02439 for( int i = 0; i < len; i++, src += cn ) 02440 if( mask[i] ) 02441 { 02442 for( int k = 0; k < cn; k++ ) 02443 result += cv_abs(src[k]); 02444 } 02445 } 02446 *_result = result; 02447 return 0; 02448 } 02449 02450 template<typename T, typename ST> int 02451 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn) 02452 { 02453 ST result = *_result; 02454 if( !mask ) 02455 { 02456 result += normL2Sqr<T, ST>(src, len*cn); 02457 } 02458 else 02459 { 02460 for( int i = 0; i < len; i++, src += cn ) 02461 if( mask[i] ) 02462 { 02463 for( int k = 0; k < cn; k++ ) 02464 { 02465 T v = src[k]; 02466 result += (ST)v*v; 02467 } 02468 } 02469 } 02470 *_result = result; 02471 return 0; 02472 } 02473 02474 template<typename T, typename ST> int 02475 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) 02476 { 02477 ST result = *_result; 02478 if( !mask ) 02479 { 02480 result = std::max(result, normInf<T, ST>(src1, src2, len*cn)); 02481 } 02482 else 02483 { 02484 for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) 02485 if( mask[i] ) 02486 { 02487 for( int k = 0; k < cn; k++ ) 02488 result = std::max(result, (ST)std::abs(src1[k] - src2[k])); 02489 } 02490 } 02491 *_result = result; 02492 return 0; 02493 } 02494 02495 template<typename T, typename ST> int 02496 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) 02497 { 02498 ST result = *_result; 02499 if( !mask ) 02500 { 02501 result += normL1<T, ST>(src1, src2, len*cn); 02502 } 02503 else 02504 { 02505 for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) 02506 if( mask[i] ) 02507 { 02508 for( int k = 0; k < cn; k++ ) 02509 result += std::abs(src1[k] - src2[k]); 02510 } 02511 } 02512 *_result = result; 02513 return 0; 02514 } 02515 02516 template<typename T, typename ST> int 02517 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn) 02518 { 02519 ST result = *_result; 02520 if( !mask ) 02521 { 02522 result += normL2Sqr<T, ST>(src1, src2, len*cn); 02523 } 02524 else 02525 { 02526 for( int i = 0; i < len; i++, src1 += cn, src2 += cn ) 02527 if( mask[i] ) 02528 { 02529 for( int k = 0; k < cn; k++ ) 02530 { 02531 ST v = src1[k] - src2[k]; 02532 result += v*v; 02533 } 02534 } 02535 } 02536 *_result = result; 02537 return 0; 02538 } 02539 02540 Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const 02541 { 02542 return cv::hal::normHamming(a, b, size); 02543 } 02544 02545 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ 02546 static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \ 02547 { return norm##L##_(src, mask, r, len, cn); } \ 02548 static int normDiff##L##_##suffix(const type* src1, const type* src2, \ 02549 const uchar* mask, ntype* r, int len, int cn) \ 02550 { return normDiff##L##_(src1, src2, mask, r, (int)len, cn); } 02551 02552 #define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \ 02553 CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \ 02554 CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \ 02555 CV_DEF_NORM_FUNC(L2, suffix, type, l2type) 02556 02557 CV_DEF_NORM_ALL(8u, uchar, int, int, int) 02558 CV_DEF_NORM_ALL(8s, schar, int, int, int) 02559 CV_DEF_NORM_ALL(16u, ushort, int, int, double) 02560 CV_DEF_NORM_ALL(16s, short, int, int, double) 02561 CV_DEF_NORM_ALL(32s, int, int, double, double) 02562 CV_DEF_NORM_ALL(32f, float, float, double, double) 02563 CV_DEF_NORM_ALL(64f, double, double, double, double) 02564 02565 02566 typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int); 02567 typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int); 02568 02569 static NormFunc getNormFunc(int normType, int depth) 02570 { 02571 static NormFunc normTab[3][8] = 02572 { 02573 { 02574 (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s), 02575 (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0 02576 }, 02577 { 02578 (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s), 02579 (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0 02580 }, 02581 { 02582 (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s), 02583 (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0 02584 } 02585 }; 02586 02587 return normTab[normType][depth]; 02588 } 02589 02590 static NormDiffFunc getNormDiffFunc(int normType, int depth) 02591 { 02592 static NormDiffFunc normDiffTab[3][8] = 02593 { 02594 { 02595 (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s, 02596 (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s, 02597 (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f), 02598 (NormDiffFunc)normDiffInf_64f, 0 02599 }, 02600 { 02601 (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s, 02602 (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s, 02603 (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f), 02604 (NormDiffFunc)normDiffL1_64f, 0 02605 }, 02606 { 02607 (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s, 02608 (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s, 02609 (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f), 02610 (NormDiffFunc)normDiffL2_64f, 0 02611 } 02612 }; 02613 02614 return normDiffTab[normType][depth]; 02615 } 02616 02617 #ifdef HAVE_OPENCL 02618 02619 static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result ) 02620 { 02621 const ocl::Device & d = ocl::Device::getDefault(); 02622 02623 #ifdef ANDROID 02624 if (d.isNVidia()) 02625 return false; 02626 #endif 02627 02628 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 02629 bool doubleSupport = d.doubleFPConfig() > 0, 02630 haveMask = _mask.kind() != _InputArray::NONE; 02631 02632 if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || 02633 (!doubleSupport && depth == CV_64F)) 02634 return false; 02635 02636 UMat src = _src.getUMat(); 02637 02638 if (normType == NORM_INF) 02639 { 02640 if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask, 02641 std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U)) 02642 return false; 02643 } 02644 else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) 02645 { 02646 Scalar sc; 02647 bool unstype = depth == CV_8U || depth == CV_16U; 02648 02649 if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ? 02650 OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) ) 02651 return false; 02652 02653 if (!haveMask) 02654 cn = 1; 02655 02656 double s = 0.0; 02657 for (int i = 0; i < cn; ++i) 02658 s += sc[i]; 02659 02660 result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s); 02661 } 02662 02663 return true; 02664 } 02665 02666 #endif 02667 02668 #ifdef HAVE_IPP 02669 static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result) 02670 { 02671 #if IPP_VERSION_X100 >= 700 02672 int cn = src.channels(); 02673 size_t total_size = src.total(); 02674 int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; 02675 02676 if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous())) 02677 && cols > 0 && (size_t)rows*cols == total_size 02678 && (normType == NORM_INF || normType == NORM_L1 || 02679 normType == NORM_L2 || normType == NORM_L2SQR) ) 02680 { 02681 IppiSize sz = { cols, rows }; 02682 int type = src.type(); 02683 if( !mask.empty() ) 02684 { 02685 typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *); 02686 ippiMaskNormFuncC1 ippFuncC1 = 02687 normType == NORM_INF ? 02688 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR : 02689 #if IPP_VERSION_X100 < 900 02690 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR : 02691 #endif 02692 // type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR : 02693 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR : 02694 0) : 02695 normType == NORM_L1 ? 02696 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR : 02697 #if IPP_VERSION_X100 < 900 02698 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR : 02699 #endif 02700 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR : 02701 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR : 02702 0) : 02703 normType == NORM_L2 || normType == NORM_L2SQR ? 02704 (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR : 02705 #if IPP_VERSION_X100 < 900 02706 type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR : 02707 #endif 02708 type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR : 02709 type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR : 02710 0) : 0; 02711 if( ippFuncC1 ) 02712 { 02713 Ipp64f norm; 02714 if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 ) 02715 { 02716 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm); 02717 return true; 02718 } 02719 } 02720 #if IPP_DISABLE_BLOCK 02721 typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *); 02722 ippiMaskNormFuncC3 ippFuncC3 = 02723 normType == NORM_INF ? 02724 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR : 02725 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR : 02726 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR : 02727 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR : 02728 0) : 02729 normType == NORM_L1 ? 02730 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR : 02731 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR : 02732 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR : 02733 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR : 02734 0) : 02735 normType == NORM_L2 || normType == NORM_L2SQR ? 02736 (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR : 02737 type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR : 02738 type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR : 02739 type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR : 02740 0) : 0; 02741 if( ippFuncC3 ) 02742 { 02743 Ipp64f norm1, norm2, norm3; 02744 if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 && 02745 ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 && 02746 ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0) 02747 { 02748 Ipp64f norm = 02749 normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) : 02750 normType == NORM_L1 ? norm1 + norm2 + norm3 : 02751 normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) : 02752 0; 02753 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm); 02754 return true; 02755 } 02756 } 02757 #endif 02758 } 02759 else 02760 { 02761 typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint); 02762 typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *); 02763 ippiNormFuncHint ippFuncHint = 02764 normType == NORM_L1 ? 02765 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R : 02766 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R : 02767 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R : 02768 0) : 02769 normType == NORM_L2 || normType == NORM_L2SQR ? 02770 (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R : 02771 type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R : 02772 type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R : 02773 0) : 0; 02774 ippiNormFuncNoHint ippFuncNoHint = 02775 normType == NORM_INF ? 02776 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R : 02777 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R : 02778 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R : 02779 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R : 02780 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R : 02781 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R : 02782 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R : 02783 #if (IPP_VERSION_X100 >= 810) 02784 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768 02785 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768 02786 #endif 02787 type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R : 02788 type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R : 02789 type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R : 02790 0) : 02791 normType == NORM_L1 ? 02792 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R : 02793 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R : 02794 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R : 02795 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R : 02796 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R : 02797 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R : 02798 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R : 02799 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R : 02800 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R : 02801 0) : 02802 normType == NORM_L2 || normType == NORM_L2SQR ? 02803 (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R : 02804 type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R : 02805 type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R : 02806 type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R : 02807 type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R : 02808 type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R : 02809 type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R : 02810 type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R : 02811 type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R : 02812 0) : 0; 02813 // Make sure only zero or one version of the function pointer is valid 02814 CV_Assert(!ippFuncHint || !ippFuncNoHint); 02815 if( ippFuncHint || ippFuncNoHint ) 02816 { 02817 Ipp64f norm_array[4]; 02818 IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, norm_array, ippAlgHintAccurate) : 02819 ippFuncNoHint(src.ptr(), (int)src.step[0], sz, norm_array); 02820 if( ret >= 0 ) 02821 { 02822 Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0]; 02823 for( int i = 1; i < cn; i++ ) 02824 { 02825 norm = 02826 normType == NORM_INF ? std::max(norm, norm_array[i]) : 02827 normType == NORM_L1 ? norm + norm_array[i] : 02828 normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] : 02829 0; 02830 } 02831 result = (normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm); 02832 return true; 02833 } 02834 } 02835 } 02836 } 02837 #else 02838 CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result); 02839 #endif 02840 return false; 02841 } 02842 #endif 02843 } 02844 02845 double cv::norm( InputArray _src, int normType, InputArray _mask ) 02846 { 02847 normType &= NORM_TYPE_MASK; 02848 CV_Assert( normType == NORM_INF || normType == NORM_L1 || 02849 normType == NORM_L2 || normType == NORM_L2SQR || 02850 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) ); 02851 02852 #if defined HAVE_OPENCL || defined HAVE_IPP 02853 double _result = 0; 02854 #endif 02855 02856 #ifdef HAVE_OPENCL 02857 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, 02858 ocl_norm(_src, normType, _mask, _result), 02859 _result) 02860 #endif 02861 02862 Mat src = _src.getMat(), mask = _mask.getMat(); 02863 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result); 02864 02865 int depth = src.depth(), cn = src.channels(); 02866 if( src.isContinuous() && mask.empty() ) 02867 { 02868 size_t len = src.total()*cn; 02869 if( len == (size_t)(int)len ) 02870 { 02871 if( depth == CV_32F ) 02872 { 02873 const float* data = src.ptr<float>(); 02874 02875 if( normType == NORM_L2 ) 02876 { 02877 double result = 0; 02878 GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1); 02879 return std::sqrt(result); 02880 } 02881 if( normType == NORM_L2SQR ) 02882 { 02883 double result = 0; 02884 GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1); 02885 return result; 02886 } 02887 if( normType == NORM_L1 ) 02888 { 02889 double result = 0; 02890 GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1); 02891 return result; 02892 } 02893 if( normType == NORM_INF ) 02894 { 02895 float result = 0; 02896 GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1); 02897 return result; 02898 } 02899 } 02900 if( depth == CV_8U ) 02901 { 02902 const uchar* data = src.ptr<uchar>(); 02903 02904 if( normType == NORM_HAMMING ) 02905 { 02906 return hal::normHamming(data, (int)len); 02907 } 02908 02909 if( normType == NORM_HAMMING2 ) 02910 { 02911 return hal::normHamming(data, (int)len, 2); 02912 } 02913 } 02914 } 02915 } 02916 02917 CV_Assert( mask.empty() || mask.type() == CV_8U ); 02918 02919 if( normType == NORM_HAMMING || normType == NORM_HAMMING2 ) 02920 { 02921 if( !mask.empty() ) 02922 { 02923 Mat temp; 02924 bitwise_and(src, mask, temp); 02925 return norm(temp, normType); 02926 } 02927 int cellSize = normType == NORM_HAMMING ? 1 : 2; 02928 02929 const Mat* arrays[] = {&src, 0}; 02930 uchar* ptrs[1]; 02931 NAryMatIterator it(arrays, ptrs); 02932 int total = (int)it.size; 02933 int result = 0; 02934 02935 for( size_t i = 0; i < it.nplanes; i++, ++it ) 02936 { 02937 result += hal::normHamming(ptrs[0], total, cellSize); 02938 } 02939 02940 return result; 02941 } 02942 02943 NormFunc func = getNormFunc(normType >> 1, depth); 02944 CV_Assert( func != 0 ); 02945 02946 const Mat* arrays[] = {&src, &mask, 0}; 02947 uchar* ptrs[2]; 02948 union 02949 { 02950 double d; 02951 int i; 02952 float f; 02953 } 02954 result; 02955 result.d = 0; 02956 NAryMatIterator it(arrays, ptrs); 02957 int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0; 02958 bool blockSum = (normType == NORM_L1 && depth <= CV_16S) || 02959 ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S); 02960 int isum = 0; 02961 int *ibuf = &result.i; 02962 size_t esz = 0; 02963 02964 if( blockSum ) 02965 { 02966 intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn; 02967 blockSize = std::min(blockSize, intSumBlockSize); 02968 ibuf = &isum; 02969 esz = src.elemSize(); 02970 } 02971 02972 for( size_t i = 0; i < it.nplanes; i++, ++it ) 02973 { 02974 for( j = 0; j < total; j += blockSize ) 02975 { 02976 int bsz = std::min(total - j, blockSize); 02977 func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn ); 02978 count += bsz; 02979 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) 02980 { 02981 result.d += isum; 02982 isum = 0; 02983 count = 0; 02984 } 02985 ptrs[0] += bsz*esz; 02986 if( ptrs[1] ) 02987 ptrs[1] += bsz; 02988 } 02989 } 02990 02991 if( normType == NORM_INF ) 02992 { 02993 if( depth == CV_64F ) 02994 ; 02995 else if( depth == CV_32F ) 02996 result.d = result.f; 02997 else 02998 result.d = result.i; 02999 } 03000 else if( normType == NORM_L2 ) 03001 result.d = std::sqrt(result.d); 03002 03003 return result.d; 03004 } 03005 03006 #ifdef HAVE_OPENCL 03007 03008 namespace cv { 03009 03010 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) 03011 { 03012 #ifdef ANDROID 03013 if (ocl::Device::getDefault().isNVidia()) 03014 return false; 03015 #endif 03016 03017 Scalar sc1, sc2; 03018 int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 03019 bool relative = (normType & NORM_RELATIVE) != 0; 03020 normType &= ~NORM_RELATIVE; 03021 bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR; 03022 03023 if (normsum) 03024 { 03025 if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ? 03026 OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2)) 03027 return false; 03028 } 03029 else 03030 { 03031 if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth), 03032 false, _src2, relative ? &sc2[0] : NULL)) 03033 return false; 03034 cn = 1; 03035 } 03036 03037 double s2 = 0; 03038 for (int i = 0; i < cn; ++i) 03039 { 03040 result += sc1[i]; 03041 if (relative) 03042 s2 += sc2[i]; 03043 } 03044 03045 if (normType == NORM_L2) 03046 { 03047 result = std::sqrt(result); 03048 if (relative) 03049 s2 = std::sqrt(s2); 03050 } 03051 03052 if (relative) 03053 result /= (s2 + DBL_EPSILON); 03054 03055 return true; 03056 } 03057 03058 } 03059 03060 #endif 03061 03062 #ifdef HAVE_IPP 03063 namespace cv 03064 { 03065 static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result) 03066 { 03067 #if IPP_VERSION_X100 >= 700 03068 Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat(); 03069 03070 if( normType & CV_RELATIVE ) 03071 { 03072 normType &= NORM_TYPE_MASK; 03073 CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR || 03074 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); 03075 size_t total_size = src1.total(); 03076 int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0; 03077 if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous())) 03078 && cols > 0 && (size_t)rows*cols == total_size 03079 && (normType == NORM_INF || normType == NORM_L1 || 03080 normType == NORM_L2 || normType == NORM_L2SQR) ) 03081 { 03082 IppiSize sz = { cols, rows }; 03083 int type = src1.type(); 03084 if( !mask.empty() ) 03085 { 03086 typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *); 03087 ippiMaskNormRelFuncC1 ippFuncC1 = 03088 normType == NORM_INF ? 03089 (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR : 03090 #if IPP_VERSION_X100 < 900 03091 #ifndef __APPLE__ 03092 type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR : 03093 #endif 03094 #endif 03095 type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR : 03096 type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR : 03097 0) : 03098 normType == NORM_L1 ? 03099 (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR : 03100 #if IPP_VERSION_X100 < 900 03101 #ifndef __APPLE__ 03102 type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR : 03103 #endif 03104 #endif 03105 type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR : 03106 type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR : 03107 0) : 03108 normType == NORM_L2 || normType == NORM_L2SQR ? 03109 (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR : 03110 #if IPP_VERSION_X100 < 900 03111 type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR : 03112 #endif 03113 type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR : 03114 type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR : 03115 0) : 0; 03116 if( ippFuncC1 ) 03117 { 03118 Ipp64f norm; 03119 if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 ) 03120 { 03121 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm); 03122 return true; 03123 } 03124 } 03125 } 03126 else 03127 { 03128 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *); 03129 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint); 03130 ippiNormRelFuncNoHint ippFuncNoHint = 03131 normType == NORM_INF ? 03132 (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R : 03133 type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R : 03134 type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R : 03135 type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R : 03136 0) : 03137 normType == NORM_L1 ? 03138 (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R : 03139 type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R : 03140 type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R : 03141 0) : 03142 normType == NORM_L2 || normType == NORM_L2SQR ? 03143 (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R : 03144 type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R : 03145 type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R : 03146 0) : 0; 03147 ippiNormRelFuncHint ippFuncHint = 03148 normType == NORM_L1 ? 03149 (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R : 03150 0) : 03151 normType == NORM_L2 || normType == NORM_L2SQR ? 03152 (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R : 03153 0) : 0; 03154 if (ippFuncNoHint) 03155 { 03156 Ipp64f norm; 03157 if( ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm) >= 0 ) 03158 { 03159 result = (double)norm; 03160 return true; 03161 } 03162 } 03163 if (ippFuncHint) 03164 { 03165 Ipp64f norm; 03166 if( ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 ) 03167 { 03168 result = (double)norm; 03169 return true; 03170 } 03171 } 03172 } 03173 } 03174 return false; 03175 } 03176 03177 normType &= 7; 03178 CV_Assert( normType == NORM_INF || normType == NORM_L1 || 03179 normType == NORM_L2 || normType == NORM_L2SQR || 03180 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); 03181 03182 size_t total_size = src1.total(); 03183 int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0; 03184 if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous())) 03185 && cols > 0 && (size_t)rows*cols == total_size 03186 && (normType == NORM_INF || normType == NORM_L1 || 03187 normType == NORM_L2 || normType == NORM_L2SQR) ) 03188 { 03189 IppiSize sz = { cols, rows }; 03190 int type = src1.type(); 03191 if( !mask.empty() ) 03192 { 03193 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *); 03194 ippiMaskNormDiffFuncC1 ippFuncC1 = 03195 normType == NORM_INF ? 03196 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR : 03197 #if IPP_VERSION_X100 < 900 03198 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR : 03199 #endif 03200 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR : 03201 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR : 03202 0) : 03203 normType == NORM_L1 ? 03204 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR : 03205 #if IPP_VERSION_X100 < 900 03206 #ifndef __APPLE__ 03207 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR : 03208 #endif 03209 #endif 03210 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR : 03211 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR : 03212 0) : 03213 normType == NORM_L2 || normType == NORM_L2SQR ? 03214 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR : 03215 #if IPP_VERSION_X100 < 900 03216 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR : 03217 #endif 03218 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR : 03219 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR : 03220 0) : 0; 03221 if( ippFuncC1 ) 03222 { 03223 Ipp64f norm; 03224 if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 ) 03225 { 03226 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm); 03227 return true; 03228 } 03229 } 03230 #ifndef __APPLE__ 03231 typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *); 03232 ippiMaskNormDiffFuncC3 ippFuncC3 = 03233 normType == NORM_INF ? 03234 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR : 03235 #if IPP_VERSION_X100 < 900 03236 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR : 03237 #endif 03238 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR : 03239 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR : 03240 0) : 03241 normType == NORM_L1 ? 03242 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR : 03243 #if IPP_VERSION_X100 < 900 03244 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR : 03245 #endif 03246 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR : 03247 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR : 03248 0) : 03249 normType == NORM_L2 || normType == NORM_L2SQR ? 03250 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR : 03251 #if IPP_VERSION_X100 < 900 03252 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR : 03253 #endif 03254 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR : 03255 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR : 03256 0) : 0; 03257 if( ippFuncC3 ) 03258 { 03259 Ipp64f norm1, norm2, norm3; 03260 if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 && 03261 ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 && 03262 ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0) 03263 { 03264 Ipp64f norm = 03265 normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) : 03266 normType == NORM_L1 ? norm1 + norm2 + norm3 : 03267 normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) : 03268 0; 03269 result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm); 03270 return true; 03271 } 03272 } 03273 #endif 03274 } 03275 else 03276 { 03277 typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint); 03278 typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *); 03279 ippiNormDiffFuncHint ippFuncHint = 03280 normType == NORM_L1 ? 03281 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R : 03282 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R : 03283 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R : 03284 0) : 03285 normType == NORM_L2 || normType == NORM_L2SQR ? 03286 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R : 03287 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R : 03288 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R : 03289 0) : 0; 03290 ippiNormDiffFuncNoHint ippFuncNoHint = 03291 normType == NORM_INF ? 03292 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R : 03293 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R : 03294 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R : 03295 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R : 03296 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R : 03297 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R : 03298 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R : 03299 #if (IPP_VERSION_X100 >= 810) 03300 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768 03301 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768 03302 #endif 03303 type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R : 03304 type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R : 03305 type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R : 03306 0) : 03307 normType == NORM_L1 ? 03308 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R : 03309 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R : 03310 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R : 03311 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R : 03312 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R : 03313 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R : 03314 #if !(IPP_VERSION_X100 == 820 || IPP_VERSION_X100 == 821) // Oct 2014: Accuracy issue with IPP 8.2 / 8.2.1 03315 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R : 03316 #endif 03317 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R : 03318 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R : 03319 0) : 03320 normType == NORM_L2 || normType == NORM_L2SQR ? 03321 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R : 03322 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R : 03323 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R : 03324 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R : 03325 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R : 03326 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R : 03327 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R : 03328 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R : 03329 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R : 03330 0) : 0; 03331 // Make sure only zero or one version of the function pointer is valid 03332 CV_Assert(!ippFuncHint || !ippFuncNoHint); 03333 if( ippFuncHint || ippFuncNoHint ) 03334 { 03335 Ipp64f norm_array[4]; 03336 IppStatus ret = ippFuncHint ? ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) : 03337 ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array); 03338 if( ret >= 0 ) 03339 { 03340 Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0]; 03341 for( int i = 1; i < src1.channels(); i++ ) 03342 { 03343 norm = 03344 normType == NORM_INF ? std::max(norm, norm_array[i]) : 03345 normType == NORM_L1 ? norm + norm_array[i] : 03346 normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] : 03347 0; 03348 } 03349 result = (normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm); 03350 return true; 03351 } 03352 } 03353 } 03354 } 03355 #else 03356 CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result); 03357 #endif 03358 return false; 03359 } 03360 } 03361 #endif 03362 03363 03364 double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask ) 03365 { 03366 CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() ); 03367 03368 #if defined HAVE_OPENCL || defined HAVE_IPP 03369 double _result = 0; 03370 #endif 03371 03372 #ifdef HAVE_OPENCL 03373 CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()), 03374 ocl_norm(_src1, _src2, normType, _mask, _result), 03375 _result) 03376 #endif 03377 03378 CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(_src1, _src2, normType, _mask, _result), _result); 03379 03380 if( normType & CV_RELATIVE ) 03381 { 03382 return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON); 03383 } 03384 03385 Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat(); 03386 int depth = src1.depth(), cn = src1.channels(); 03387 03388 normType &= 7; 03389 CV_Assert( normType == NORM_INF || normType == NORM_L1 || 03390 normType == NORM_L2 || normType == NORM_L2SQR || 03391 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); 03392 03393 if( src1.isContinuous() && src2.isContinuous() && mask.empty() ) 03394 { 03395 size_t len = src1.total()*src1.channels(); 03396 if( len == (size_t)(int)len ) 03397 { 03398 if( src1.depth() == CV_32F ) 03399 { 03400 const float* data1 = src1.ptr<float>(); 03401 const float* data2 = src2.ptr<float>(); 03402 03403 if( normType == NORM_L2 ) 03404 { 03405 double result = 0; 03406 GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1); 03407 return std::sqrt(result); 03408 } 03409 if( normType == NORM_L2SQR ) 03410 { 03411 double result = 0; 03412 GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1); 03413 return result; 03414 } 03415 if( normType == NORM_L1 ) 03416 { 03417 double result = 0; 03418 GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1); 03419 return result; 03420 } 03421 if( normType == NORM_INF ) 03422 { 03423 float result = 0; 03424 GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1); 03425 return result; 03426 } 03427 } 03428 } 03429 } 03430 03431 CV_Assert( mask.empty() || mask.type() == CV_8U ); 03432 03433 if( normType == NORM_HAMMING || normType == NORM_HAMMING2 ) 03434 { 03435 if( !mask.empty() ) 03436 { 03437 Mat temp; 03438 bitwise_xor(src1, src2, temp); 03439 bitwise_and(temp, mask, temp); 03440 return norm(temp, normType); 03441 } 03442 int cellSize = normType == NORM_HAMMING ? 1 : 2; 03443 03444 const Mat* arrays[] = {&src1, &src2, 0}; 03445 uchar* ptrs[2]; 03446 NAryMatIterator it(arrays, ptrs); 03447 int total = (int)it.size; 03448 int result = 0; 03449 03450 for( size_t i = 0; i < it.nplanes; i++, ++it ) 03451 { 03452 result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize); 03453 } 03454 03455 return result; 03456 } 03457 03458 NormDiffFunc func = getNormDiffFunc(normType >> 1, depth); 03459 CV_Assert( func != 0 ); 03460 03461 const Mat* arrays[] = {&src1, &src2, &mask, 0}; 03462 uchar* ptrs[3]; 03463 union 03464 { 03465 double d; 03466 float f; 03467 int i; 03468 unsigned u; 03469 } 03470 result; 03471 result.d = 0; 03472 NAryMatIterator it(arrays, ptrs); 03473 int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0; 03474 bool blockSum = (normType == NORM_L1 && depth <= CV_16S) || 03475 ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S); 03476 unsigned isum = 0; 03477 unsigned *ibuf = &result.u; 03478 size_t esz = 0; 03479 03480 if( blockSum ) 03481 { 03482 intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15); 03483 blockSize = std::min(blockSize, intSumBlockSize); 03484 ibuf = &isum; 03485 esz = src1.elemSize(); 03486 } 03487 03488 for( size_t i = 0; i < it.nplanes; i++, ++it ) 03489 { 03490 for( j = 0; j < total; j += blockSize ) 03491 { 03492 int bsz = std::min(total - j, blockSize); 03493 func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn ); 03494 count += bsz; 03495 if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) 03496 { 03497 result.d += isum; 03498 isum = 0; 03499 count = 0; 03500 } 03501 ptrs[0] += bsz*esz; 03502 ptrs[1] += bsz*esz; 03503 if( ptrs[2] ) 03504 ptrs[2] += bsz; 03505 } 03506 } 03507 03508 if( normType == NORM_INF ) 03509 { 03510 if( depth == CV_64F ) 03511 ; 03512 else if( depth == CV_32F ) 03513 result.d = result.f; 03514 else 03515 result.d = result.u; 03516 } 03517 else if( normType == NORM_L2 ) 03518 result.d = std::sqrt(result.d); 03519 03520 return result.d; 03521 } 03522 03523 03524 ///////////////////////////////////// batch distance /////////////////////////////////////// 03525 03526 namespace cv 03527 { 03528 03529 template<typename _Tp, typename _Rt> 03530 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2, 03531 int nvecs, int len, _Rt* dist, const uchar* mask) 03532 { 03533 step2 /= sizeof(src2[0]); 03534 if( !mask ) 03535 { 03536 for( int i = 0; i < nvecs; i++ ) 03537 dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len); 03538 } 03539 else 03540 { 03541 _Rt val0 = std::numeric_limits<_Rt>::max(); 03542 for( int i = 0; i < nvecs; i++ ) 03543 dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0; 03544 } 03545 } 03546 03547 template<typename _Tp, typename _Rt> 03548 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2, 03549 int nvecs, int len, _Rt* dist, const uchar* mask) 03550 { 03551 step2 /= sizeof(src2[0]); 03552 if( !mask ) 03553 { 03554 for( int i = 0; i < nvecs; i++ ) 03555 dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len); 03556 } 03557 else 03558 { 03559 _Rt val0 = std::numeric_limits<_Rt>::max(); 03560 for( int i = 0; i < nvecs; i++ ) 03561 dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0; 03562 } 03563 } 03564 03565 template<typename _Tp, typename _Rt> 03566 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2, 03567 int nvecs, int len, _Rt* dist, const uchar* mask) 03568 { 03569 step2 /= sizeof(src2[0]); 03570 if( !mask ) 03571 { 03572 for( int i = 0; i < nvecs; i++ ) 03573 dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)); 03574 } 03575 else 03576 { 03577 _Rt val0 = std::numeric_limits<_Rt>::max(); 03578 for( int i = 0; i < nvecs; i++ ) 03579 dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0; 03580 } 03581 } 03582 03583 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, 03584 int nvecs, int len, int* dist, const uchar* mask) 03585 { 03586 step2 /= sizeof(src2[0]); 03587 if( !mask ) 03588 { 03589 for( int i = 0; i < nvecs; i++ ) 03590 dist[i] = hal::normHamming(src1, src2 + step2*i, len); 03591 } 03592 else 03593 { 03594 int val0 = INT_MAX; 03595 for( int i = 0; i < nvecs; i++ ) 03596 { 03597 if (mask[i]) 03598 dist[i] = hal::normHamming(src1, src2 + step2*i, len); 03599 else 03600 dist[i] = val0; 03601 } 03602 } 03603 } 03604 03605 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2, 03606 int nvecs, int len, int* dist, const uchar* mask) 03607 { 03608 step2 /= sizeof(src2[0]); 03609 if( !mask ) 03610 { 03611 for( int i = 0; i < nvecs; i++ ) 03612 dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); 03613 } 03614 else 03615 { 03616 int val0 = INT_MAX; 03617 for( int i = 0; i < nvecs; i++ ) 03618 { 03619 if (mask[i]) 03620 dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); 03621 else 03622 dist[i] = val0; 03623 } 03624 } 03625 } 03626 03627 static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2, 03628 int nvecs, int len, int* dist, const uchar* mask) 03629 { 03630 batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask); 03631 } 03632 03633 static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2, 03634 int nvecs, int len, float* dist, const uchar* mask) 03635 { 03636 batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask); 03637 } 03638 03639 static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2, 03640 int nvecs, int len, int* dist, const uchar* mask) 03641 { 03642 batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask); 03643 } 03644 03645 static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2, 03646 int nvecs, int len, float* dist, const uchar* mask) 03647 { 03648 batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask); 03649 } 03650 03651 static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2, 03652 int nvecs, int len, float* dist, const uchar* mask) 03653 { 03654 batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask); 03655 } 03656 03657 static void batchDistL1_32f(const float* src1, const float* src2, size_t step2, 03658 int nvecs, int len, float* dist, const uchar* mask) 03659 { 03660 batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask); 03661 } 03662 03663 static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2, 03664 int nvecs, int len, float* dist, const uchar* mask) 03665 { 03666 batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask); 03667 } 03668 03669 static void batchDistL2_32f(const float* src1, const float* src2, size_t step2, 03670 int nvecs, int len, float* dist, const uchar* mask) 03671 { 03672 batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask); 03673 } 03674 03675 typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2, 03676 int nvecs, int len, uchar* dist, const uchar* mask); 03677 03678 03679 struct BatchDistInvoker : public ParallelLoopBody 03680 { 03681 BatchDistInvoker( const Mat& _src1, const Mat& _src2, 03682 Mat& _dist, Mat& _nidx, int _K, 03683 const Mat& _mask, int _update, 03684 BatchDistFunc _func) 03685 { 03686 src1 = &_src1; 03687 src2 = &_src2; 03688 dist = &_dist; 03689 nidx = &_nidx; 03690 K = _K; 03691 mask = &_mask; 03692 update = _update; 03693 func = _func; 03694 } 03695 03696 void operator()(const Range& range) const 03697 { 03698 AutoBuffer<int> buf(src2->rows); 03699 int* bufptr = buf; 03700 03701 for( int i = range.start; i < range.end; i++ ) 03702 { 03703 func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols, 03704 K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0); 03705 03706 if( K > 0 ) 03707 { 03708 int* nidxptr = nidx->ptr<int>(i); 03709 // since positive float's can be compared just like int's, 03710 // we handle both CV_32S and CV_32F cases with a single branch 03711 int* distptr = (int*)dist->ptr(i); 03712 03713 int j, k; 03714 03715 for( j = 0; j < src2->rows; j++ ) 03716 { 03717 int d = bufptr[j]; 03718 if( d < distptr[K-1] ) 03719 { 03720 for( k = K-2; k >= 0 && distptr[k] > d; k-- ) 03721 { 03722 nidxptr[k+1] = nidxptr[k]; 03723 distptr[k+1] = distptr[k]; 03724 } 03725 nidxptr[k+1] = j + update; 03726 distptr[k+1] = d; 03727 } 03728 } 03729 } 03730 } 03731 } 03732 03733 const Mat *src1; 03734 const Mat *src2; 03735 Mat *dist; 03736 Mat *nidx; 03737 const Mat *mask; 03738 int K; 03739 int update; 03740 BatchDistFunc func; 03741 }; 03742 03743 } 03744 03745 void cv::batchDistance( InputArray _src1, InputArray _src2, 03746 OutputArray _dist, int dtype, OutputArray _nidx, 03747 int normType, int K, InputArray _mask, 03748 int update, bool crosscheck ) 03749 { 03750 Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat(); 03751 int type = src1.type(); 03752 CV_Assert( type == src2.type() && src1.cols == src2.cols && 03753 (type == CV_32F || type == CV_8U)); 03754 CV_Assert( _nidx.needed() == (K > 0) ); 03755 03756 if( dtype == -1 ) 03757 { 03758 dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F; 03759 } 03760 CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F); 03761 03762 K = std::min(K, src2.rows); 03763 03764 _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype); 03765 Mat dist = _dist.getMat(), nidx; 03766 if( _nidx.needed() ) 03767 { 03768 _nidx.create(dist.size(), CV_32S); 03769 nidx = _nidx.getMat(); 03770 } 03771 03772 if( update == 0 && K > 0 ) 03773 { 03774 dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX); 03775 nidx = Scalar::all(-1); 03776 } 03777 03778 if( crosscheck ) 03779 { 03780 CV_Assert( K == 1 && update == 0 && mask.empty() ); 03781 Mat tdist, tidx; 03782 batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false); 03783 03784 // if an idx-th element from src1 appeared to be the nearest to i-th element of src2, 03785 // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set. 03786 // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest 03787 // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1. 03788 // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2. 03789 // This O(N) procedure is called cross-check and it helps to eliminate some false matches. 03790 if( dtype == CV_32S ) 03791 { 03792 for( int i = 0; i < tdist.rows; i++ ) 03793 { 03794 int idx = tidx.at<int>(i); 03795 int d = tdist.at<int>(i), d0 = dist.at<int>(idx); 03796 if( d < d0 ) 03797 { 03798 dist.at<int>(idx) = d; 03799 nidx.at<int>(idx) = i + update; 03800 } 03801 } 03802 } 03803 else 03804 { 03805 for( int i = 0; i < tdist.rows; i++ ) 03806 { 03807 int idx = tidx.at<int>(i); 03808 float d = tdist.at<float>(i), d0 = dist.at<float>(idx); 03809 if( d < d0 ) 03810 { 03811 dist.at<float>(idx) = d; 03812 nidx.at<int>(idx) = i + update; 03813 } 03814 } 03815 } 03816 return; 03817 } 03818 03819 BatchDistFunc func = 0; 03820 if( type == CV_8U ) 03821 { 03822 if( normType == NORM_L1 && dtype == CV_32S ) 03823 func = (BatchDistFunc)batchDistL1_8u32s; 03824 else if( normType == NORM_L1 && dtype == CV_32F ) 03825 func = (BatchDistFunc)batchDistL1_8u32f; 03826 else if( normType == NORM_L2SQR && dtype == CV_32S ) 03827 func = (BatchDistFunc)batchDistL2Sqr_8u32s; 03828 else if( normType == NORM_L2SQR && dtype == CV_32F ) 03829 func = (BatchDistFunc)batchDistL2Sqr_8u32f; 03830 else if( normType == NORM_L2 && dtype == CV_32F ) 03831 func = (BatchDistFunc)batchDistL2_8u32f; 03832 else if( normType == NORM_HAMMING && dtype == CV_32S ) 03833 func = (BatchDistFunc)batchDistHamming; 03834 else if( normType == NORM_HAMMING2 && dtype == CV_32S ) 03835 func = (BatchDistFunc)batchDistHamming2; 03836 } 03837 else if( type == CV_32F && dtype == CV_32F ) 03838 { 03839 if( normType == NORM_L1 ) 03840 func = (BatchDistFunc)batchDistL1_32f; 03841 else if( normType == NORM_L2SQR ) 03842 func = (BatchDistFunc)batchDistL2Sqr_32f; 03843 else if( normType == NORM_L2 ) 03844 func = (BatchDistFunc)batchDistL2_32f; 03845 } 03846 03847 if( func == 0 ) 03848 CV_Error_(CV_StsUnsupportedFormat, 03849 ("The combination of type=%d, dtype=%d and normType=%d is not supported", 03850 type, dtype, normType)); 03851 03852 parallel_for_(Range(0, src1.rows), 03853 BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func)); 03854 } 03855 03856 03857 void cv::findNonZero( InputArray _src, OutputArray _idx ) 03858 { 03859 Mat src = _src.getMat(); 03860 CV_Assert( src.type() == CV_8UC1 ); 03861 int n = countNonZero(src); 03862 if( n == 0 ) 03863 { 03864 _idx.release(); 03865 return; 03866 } 03867 if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() ) 03868 _idx.release(); 03869 _idx.create(n, 1, CV_32SC2); 03870 Mat idx = _idx.getMat(); 03871 CV_Assert(idx.isContinuous()); 03872 Point* idx_ptr = idx.ptr<Point>(); 03873 03874 for( int i = 0; i < src.rows; i++ ) 03875 { 03876 const uchar* bin_ptr = src.ptr(i); 03877 for( int j = 0; j < src.cols; j++ ) 03878 if( bin_ptr[j] ) 03879 *idx_ptr++ = Point(j, i); 03880 } 03881 } 03882 03883 double cv::PSNR(InputArray _src1, InputArray _src2) 03884 { 03885 CV_Assert( _src1.depth() == CV_8U ); 03886 double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels())); 03887 return 20*log10(255./(diff+DBL_EPSILON)); 03888 } 03889 03890 03891 CV_IMPL CvScalar cvSum( const CvArr* srcarr ) 03892 { 03893 cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1)); 03894 if( CV_IS_IMAGE(srcarr) ) 03895 { 03896 int coi = cvGetImageCOI((IplImage*)srcarr); 03897 if( coi ) 03898 { 03899 CV_Assert( 0 < coi && coi <= 4 ); 03900 sum = cv::Scalar (sum[coi-1]); 03901 } 03902 } 03903 return sum; 03904 } 03905 03906 CV_IMPL int cvCountNonZero( const CvArr* imgarr ) 03907 { 03908 cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1); 03909 if( img.channels() > 1 ) 03910 cv::extractImageCOI(imgarr, img); 03911 return countNonZero(img); 03912 } 03913 03914 03915 CV_IMPL CvScalar 03916 cvAvg( const void* imgarr, const void* maskarr ) 03917 { 03918 cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1); 03919 cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr)); 03920 if( CV_IS_IMAGE(imgarr) ) 03921 { 03922 int coi = cvGetImageCOI((IplImage*)imgarr); 03923 if( coi ) 03924 { 03925 CV_Assert( 0 < coi && coi <= 4 ); 03926 mean = cv::Scalar (mean[coi-1]); 03927 } 03928 } 03929 return mean; 03930 } 03931 03932 03933 CV_IMPL void 03934 cvAvgSdv( const CvArr* imgarr, CvScalar * _mean, CvScalar * _sdv, const void* maskarr ) 03935 { 03936 cv::Scalar mean, sdv; 03937 03938 cv::Mat mask; 03939 if( maskarr ) 03940 mask = cv::cvarrToMat(maskarr); 03941 03942 cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask ); 03943 03944 if( CV_IS_IMAGE(imgarr) ) 03945 { 03946 int coi = cvGetImageCOI((IplImage*)imgarr); 03947 if( coi ) 03948 { 03949 CV_Assert( 0 < coi && coi <= 4 ); 03950 mean = cv::Scalar (mean[coi-1]); 03951 sdv = cv::Scalar (sdv[coi-1]); 03952 } 03953 } 03954 03955 if( _mean ) 03956 *(cv::Scalar *)_mean = mean; 03957 if( _sdv ) 03958 *(cv::Scalar *)_sdv = sdv; 03959 } 03960 03961 03962 CV_IMPL void 03963 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal, 03964 CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr ) 03965 { 03966 cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1); 03967 if( maskarr ) 03968 mask = cv::cvarrToMat(maskarr); 03969 if( img.channels() > 1 ) 03970 cv::extractImageCOI(imgarr, img); 03971 03972 cv::minMaxLoc( img, _minVal, _maxVal, 03973 (cv::Point *)_minLoc, (cv::Point *)_maxLoc, mask ); 03974 } 03975 03976 03977 CV_IMPL double 03978 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr ) 03979 { 03980 cv::Mat a, mask; 03981 if( !imgA ) 03982 { 03983 imgA = imgB; 03984 imgB = 0; 03985 } 03986 03987 a = cv::cvarrToMat(imgA, false, true, 1); 03988 if( maskarr ) 03989 mask = cv::cvarrToMat(maskarr); 03990 03991 if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 ) 03992 cv::extractImageCOI(imgA, a); 03993 03994 if( !imgB ) 03995 return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask); 03996 03997 cv::Mat b = cv::cvarrToMat(imgB, false, true, 1); 03998 if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 ) 03999 cv::extractImageCOI(imgB, b); 04000 04001 return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask); 04002 } 04003 04004 namespace cv { namespace hal { 04005 04006 static const uchar popCountTable[] = 04007 { 04008 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 04009 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 04010 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 04011 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 04012 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 04013 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 04014 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 04015 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 04016 }; 04017 04018 static const uchar popCountTable2[] = 04019 { 04020 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 04021 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 04022 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 04023 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 04024 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 04025 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 04026 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 04027 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4 04028 }; 04029 04030 static const uchar popCountTable4[] = 04031 { 04032 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04033 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04034 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04035 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04036 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04037 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04038 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 04039 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 04040 }; 04041 04042 int normHamming(const uchar* a, int n) 04043 { 04044 int i = 0; 04045 int result = 0; 04046 #if CV_NEON 04047 { 04048 uint32x4_t bits = vmovq_n_u32(0); 04049 for (; i <= n - 16; i += 16) { 04050 uint8x16_t A_vec = vld1q_u8 (a + i); 04051 uint8x16_t bitsSet = vcntq_u8 (A_vec); 04052 uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); 04053 uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); 04054 bits = vaddq_u32(bits, bitSet4); 04055 } 04056 uint64x2_t bitSet2 = vpaddlq_u32 (bits); 04057 result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); 04058 result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); 04059 } 04060 #endif 04061 for( ; i <= n - 4; i += 4 ) 04062 result += popCountTable[a[i]] + popCountTable[a[i+1]] + 04063 popCountTable[a[i+2]] + popCountTable[a[i+3]]; 04064 for( ; i < n; i++ ) 04065 result += popCountTable[a[i]]; 04066 return result; 04067 } 04068 04069 int normHamming(const uchar* a, const uchar* b, int n) 04070 { 04071 int i = 0; 04072 int result = 0; 04073 #if CV_NEON 04074 { 04075 uint32x4_t bits = vmovq_n_u32(0); 04076 for (; i <= n - 16; i += 16) { 04077 uint8x16_t A_vec = vld1q_u8 (a + i); 04078 uint8x16_t B_vec = vld1q_u8 (b + i); 04079 uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); 04080 uint8x16_t bitsSet = vcntq_u8 (AxorB); 04081 uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); 04082 uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); 04083 bits = vaddq_u32(bits, bitSet4); 04084 } 04085 uint64x2_t bitSet2 = vpaddlq_u32 (bits); 04086 result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); 04087 result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); 04088 } 04089 #endif 04090 for( ; i <= n - 4; i += 4 ) 04091 result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + 04092 popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; 04093 for( ; i < n; i++ ) 04094 result += popCountTable[a[i] ^ b[i]]; 04095 return result; 04096 } 04097 04098 int normHamming(const uchar* a, int n, int cellSize) 04099 { 04100 if( cellSize == 1 ) 04101 return normHamming(a, n); 04102 const uchar* tab = 0; 04103 if( cellSize == 2 ) 04104 tab = popCountTable2; 04105 else if( cellSize == 4 ) 04106 tab = popCountTable4; 04107 else 04108 return -1; 04109 int i = 0; 04110 int result = 0; 04111 #if CV_ENABLE_UNROLLED 04112 for( ; i <= n - 4; i += 4 ) 04113 result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]]; 04114 #endif 04115 for( ; i < n; i++ ) 04116 result += tab[a[i]]; 04117 return result; 04118 } 04119 04120 int normHamming(const uchar* a, const uchar* b, int n, int cellSize) 04121 { 04122 if( cellSize == 1 ) 04123 return normHamming(a, b, n); 04124 const uchar* tab = 0; 04125 if( cellSize == 2 ) 04126 tab = popCountTable2; 04127 else if( cellSize == 4 ) 04128 tab = popCountTable4; 04129 else 04130 return -1; 04131 int i = 0; 04132 int result = 0; 04133 #if CV_ENABLE_UNROLLED 04134 for( ; i <= n - 4; i += 4 ) 04135 result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] + 04136 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]]; 04137 #endif 04138 for( ; i < n; i++ ) 04139 result += tab[a[i] ^ b[i]]; 04140 return result; 04141 } 04142 04143 float normL2Sqr_(const float* a, const float* b, int n) 04144 { 04145 int j = 0; float d = 0.f; 04146 #if CV_SSE 04147 float CV_DECL_ALIGNED(16) buf[4]; 04148 __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); 04149 04150 for( ; j <= n - 8; j += 8 ) 04151 { 04152 __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); 04153 __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); 04154 d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); 04155 d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); 04156 } 04157 _mm_store_ps(buf, _mm_add_ps(d0, d1)); 04158 d = buf[0] + buf[1] + buf[2] + buf[3]; 04159 #endif 04160 { 04161 for( ; j <= n - 4; j += 4 ) 04162 { 04163 float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; 04164 d += t0*t0 + t1*t1 + t2*t2 + t3*t3; 04165 } 04166 } 04167 04168 for( ; j < n; j++ ) 04169 { 04170 float t = a[j] - b[j]; 04171 d += t*t; 04172 } 04173 return d; 04174 } 04175 04176 04177 float normL1_(const float* a, const float* b, int n) 04178 { 04179 int j = 0; float d = 0.f; 04180 #if CV_SSE 04181 float CV_DECL_ALIGNED(16) buf[4]; 04182 static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; 04183 __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); 04184 __m128 absmask = _mm_load_ps((const float*)absbuf); 04185 04186 for( ; j <= n - 8; j += 8 ) 04187 { 04188 __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); 04189 __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); 04190 d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); 04191 d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); 04192 } 04193 _mm_store_ps(buf, _mm_add_ps(d0, d1)); 04194 d = buf[0] + buf[1] + buf[2] + buf[3]; 04195 #elif CV_NEON 04196 float32x4_t v_sum = vdupq_n_f32(0.0f); 04197 for ( ; j <= n - 4; j += 4) 04198 v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); 04199 04200 float CV_DECL_ALIGNED(16) buf[4]; 04201 vst1q_f32(buf, v_sum); 04202 d = buf[0] + buf[1] + buf[2] + buf[3]; 04203 #endif 04204 { 04205 for( ; j <= n - 4; j += 4 ) 04206 { 04207 d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + 04208 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); 04209 } 04210 } 04211 04212 for( ; j < n; j++ ) 04213 d += std::abs(a[j] - b[j]); 04214 return d; 04215 } 04216 04217 int normL1_(const uchar* a, const uchar* b, int n) 04218 { 04219 int j = 0, d = 0; 04220 #if CV_SSE 04221 __m128i d0 = _mm_setzero_si128(); 04222 04223 for( ; j <= n - 16; j += 16 ) 04224 { 04225 __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); 04226 __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); 04227 04228 d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); 04229 } 04230 04231 for( ; j <= n - 4; j += 4 ) 04232 { 04233 __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); 04234 __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); 04235 04236 d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); 04237 } 04238 d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); 04239 #elif CV_NEON 04240 uint32x4_t v_sum = vdupq_n_u32(0.0f); 04241 for ( ; j <= n - 16; j += 16) 04242 { 04243 uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); 04244 uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); 04245 v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); 04246 v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); 04247 } 04248 04249 uint CV_DECL_ALIGNED(16) buf[4]; 04250 vst1q_u32(buf, v_sum); 04251 d = buf[0] + buf[1] + buf[2] + buf[3]; 04252 #endif 04253 { 04254 for( ; j <= n - 4; j += 4 ) 04255 { 04256 d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + 04257 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); 04258 } 04259 } 04260 for( ; j < n; j++ ) 04261 d += std::abs(a[j] - b[j]); 04262 return d; 04263 } 04264 04265 }} //cv::hal 04266
Generated on Tue Jul 12 2022 15:17:31 by
1.7.2
