the do / gr-peach-opencv-project

Fork of gr-peach-opencv-project by the do

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers stat.cpp Source File

stat.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 #include <climits>
00046 #include <limits>
00047 
00048 #include "opencl_kernels_core.hpp"
00049 
00050 namespace cv
00051 {
00052 
00053 template<typename T> static inline Scalar rawToScalar(const T& v)
00054 {
00055     Scalar s;
00056     typedef typename DataType<T>::channel_type T1;
00057     int i, n = DataType<T>::channels;
00058     for( i = 0; i < n; i++ )
00059         s.val[i] = ((T1*)&v)[i];
00060     return s;
00061 }
00062 
00063 /****************************************************************************************\
00064 *                                        sum                                             *
00065 \****************************************************************************************/
00066 
00067 template <typename T, typename ST>
00068 struct Sum_SIMD
00069 {
00070     int operator () (const T *, const uchar *, ST *, int, int) const
00071     {
00072         return 0;
00073     }
00074 };
00075 
00076 #if CV_SSE2
00077 
// SSE2 sum kernel for signed 8-bit data accumulating into int.
// Handles only the unmasked case with 1, 2 or 4 channels; returns the
// number of pixels consumed (the scalar code finishes the tail).
template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Fall back to the scalar path for masked input, unsupported
        // channel counts, or when SSE2 is disabled at runtime.
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;

        // Main loop: 16 int8 values per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // Sign-extend bytes to 16 bits: place each byte in the HIGH
            // half of a 16-bit lane, then arithmetic-shift right by 8.
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            // Same trick to sign-extend 16-bit lanes to 32 bits (shift 16).
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));

            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
        }

        // Tail loop: 8 values per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        }

        // Fold the 4 vector lanes into the per-channel destination sums.
        // x is a multiple of cn, so lane j+i always belongs to channel j.
        int CV_DECL_ALIGNED(16) ar[4];
        _mm_store_si128((__m128i*)ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00120 
// SSE2 sum kernel for 32-bit int data accumulating into double:
// pairs of int32 lanes are converted to doubles and added.
template <>
struct Sum_SIMD<int, double>
{
    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        // Scalar fallback for masked input, unsupported channel counts,
        // or when SSE2 is unavailable at runtime.
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        // 4 ints per iteration: low pair -> v_sum0, high pair -> v_sum1.
        for ( ; x <= len - 4; x += 4)
        {
            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        // Fold the 4 lane partials onto the cn channel slots; x is a
        // multiple of cn, so the lanes stay channel-aligned.
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00150 
// SSE2 sum kernel for float data accumulating into double:
// pairs of float lanes are widened to double and added.
template <>
struct Sum_SIMD<float, double>
{
    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        // Scalar fallback for masked input, unsupported channel counts,
        // or when SSE2 is unavailable at runtime.
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        // 4 floats per iteration: low pair -> v_sum0, high pair -> v_sum1.
        for ( ; x <= len - 4; x += 4)
        {
            __m128 v_src = _mm_loadu_ps(src0 + x);
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
            // Shift the upper two floats down (bitwise, via the int view).
            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        // Fold the 4 lane partials onto the cn channel slots; x is a
        // multiple of cn, so the lanes stay channel-aligned.
        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00181 
00182 
00183 #elif CV_NEON
00184 
// NEON sum kernel for unsigned 8-bit data: widen u8 -> u16 -> u32 and
// accumulate in four 32-bit lanes. Returns pixels consumed; the scalar
// code finishes the tail.
template <>
struct Sum_SIMD<uchar, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Masked input and unsupported channel counts use the scalar path.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        // Main loop: 16 bytes per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            uint8x16_t v_src = vld1q_u8(src0 + x);
            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));

            v_half = vmovl_u8(vget_high_u8(v_src));
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
        }

        // Tail: 8 bytes per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        // Fold the 4 unsigned lanes into the signed per-channel sums;
        // x is a multiple of cn, so the lanes stay channel-aligned.
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00227 
// NEON sum kernel for signed 8-bit data: widen s8 -> s16 -> s32 and
// accumulate in four 32-bit lanes.
template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Masked input and unsupported channel counts use the scalar path.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0);

        // Main loop: 16 elements per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            int8x16_t v_src = vld1q_s8(src0 + x);
            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));

            v_half = vmovl_s8(vget_high_s8(v_src));
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
        }

        // Tail: 8 elements per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
        }

        // Fold the 4 lane partials into the per-channel totals;
        // x is a multiple of cn, so the lanes stay channel-aligned.
        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00270 
// NEON sum kernel for unsigned 16-bit data: widen u16 -> u32 and
// accumulate. NOTE(review): the 32-bit lane accumulators can overflow
// for very long rows of large values — presumably callers process data
// in bounded blocks; confirm before reusing this standalone.
template <>
struct Sum_SIMD<ushort, int>
{
    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        // Masked input and unsupported channel counts use the scalar path.
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        // Main loop: 8 elements per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vld1q_u16(src0 + x);

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        // Tail: 4 elements per iteration.
        for ( ; x <= len - 4; x += 4)
            v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x));

        // Fold the 4 lane partials into the per-channel totals;
        // x is a multiple of cn, so the lanes stay channel-aligned.
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
                dst[j] += ar[j + i];

        return x / cn;
    }
};
00303 
00304 template <>
00305 struct Sum_SIMD<short, int>
00306 {
00307     int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
00308     {
00309         if (mask || (cn != 1 && cn != 2 && cn != 4))
00310             return 0;
00311 
00312         int x = 0;
00313         int32x4_t v_sum = vdupq_n_s32(0u);
00314 
00315         for ( ; x <= len - 8; x += 8)
00316         {
00317             int16x8_t v_src = vld1q_s16(src0 + x);
00318 
00319             v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
00320             v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
00321         }
00322 
00323         for ( ; x <= len - 4; x += 4)
00324             v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x));
00325 
00326         int CV_DECL_ALIGNED(16) ar[4];
00327         vst1q_s32(ar, v_sum);
00328 
00329         for (int i = 0; i < 4; i += cn)
00330             for (int j = 0; j < cn; ++j)
00331                 dst[j] += ar[j + i];
00332 
00333         return x / cn;
00334     }
00335 };
00336 
00337 #endif
00338 
// Per-channel sum over one interleaved row.
//  src0 - interleaved source elements (len pixels, cn channels each)
//  mask - optional per-pixel mask (non-zero selects the pixel), or NULL
//  dst  - per-channel accumulators, updated in place (not cleared here)
// Returns len on the unmasked path, otherwise the number of non-zero
// mask entries.
template<typename T, typename ST>
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
{
    const T* src = src0;
    if( !mask )
    {
        // Let a SIMD specialization (if any) consume a prefix of the
        // row; it returns the number of pixels it handled.
        Sum_SIMD<T, ST> vop;
        int i = vop(src0, mask, dst, len, cn), k = cn % 4;
        src += i * cn;

        if( k == 1 )
        {
            // Single-channel case (cn % 4 == 1).
            ST s0 = dst[0];

            #if CV_ENABLE_UNROLLED
            for(; i <= len - 4; i += 4, src += cn*4 )
                s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
            #endif
            for( ; i < len; i++, src += cn )
                s0 += src[0];
            dst[0] = s0;
        }
        else if( k == 2 )
        {
            ST s0 = dst[0], s1 = dst[1];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
            }
            dst[0] = s0;
            dst[1] = s1;
        }
        else if( k == 3 )
        {
            ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
            }
            dst[0] = s0;
            dst[1] = s1;
            dst[2] = s2;
        }

        // Remaining channels in groups of four. NOTE(review): when k was
        // 1/2/3 the branches above already advanced i to len, so this
        // loop only does real work when cn % 4 == 0 — presumably callers
        // never pass cn > 4; confirm before relying on larger cn here.
        for( ; k < cn; k += 4 )
        {
            src = src0 + i*cn + k;
            ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0]; s1 += src[1];
                s2 += src[2]; s3 += src[3];
            }
            dst[k] = s0;
            dst[k+1] = s1;
            dst[k+2] = s2;
            dst[k+3] = s3;
        }
        return len;
    }

    // Masked path: sum only pixels with a non-zero mask byte and count
    // them (the caller typically needs the count for averaging).
    int i, nzm = 0;
    if( cn == 1 )
    {
        ST s = dst[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                s += src[i];
                nzm++;
            }
        dst[0] = s;
    }
    else if( cn == 3 )
    {
        // Common 3-channel (e.g. BGR) case kept branch-free per channel.
        ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
                nzm++;
            }
        dst[0] = s0;
        dst[1] = s1;
        dst[2] = s2;
    }
    else
    {
        // Generic masked case for any other channel count.
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                int k = 0;
                #if CV_ENABLE_UNROLLED
                for( ; k <= cn - 4; k += 4 )
                {
                    ST s0, s1;
                    s0 = dst[k] + src[k];
                    s1 = dst[k+1] + src[k+1];
                    dst[k] = s0; dst[k+1] = s1;
                    s0 = dst[k+2] + src[k+2];
                    s1 = dst[k+3] + src[k+3];
                    dst[k+2] = s0; dst[k+3] = s1;
                }
                #endif
                for( ; k < cn; k++ )
                    dst[k] += src[k];
                nzm++;
            }
    }
    return nzm;
}
00455 
00456 
// CV_8U sum kernel: uchar source, int accumulators.
static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00459 
// CV_8S sum kernel: schar source, int accumulators.
static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00462 
// CV_16U sum kernel: ushort source, int accumulators.
static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00465 
// CV_16S sum kernel: short source, int accumulators.
static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00468 
// CV_32S sum kernel: int source, double accumulators.
static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00471 
// CV_32F sum kernel: float source, double accumulators.
static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00474 
// CV_64F sum kernel: double source and accumulators.
static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
00477 
// Type-erased sum kernel signature; the typed kernels above are cast to
// it for table storage, and the caller casts pointers back per depth.
typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);

// Returns the sum kernel for the given matrix depth, or 0 when the
// depth is unsupported.
static SumFunc getSumFunc(int depth)
{
    // Indexed by depth: CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F,
    // CV_64F, then a 0 sentinel.
    static SumFunc sumTab[] =
    {
        (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
        (SumFunc)sum16u, (SumFunc)sum16s,
        (SumFunc)sum32s,
        (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
        0
    };

    return sumTab[depth];
}
00493 
// Generic scalar non-zero counter: returns how many of the first len
// elements of src differ from zero.
template<typename T>
static int countNonZero_(const T* src, int len )
{
    int nz = 0, idx = 0;
    #if CV_ENABLE_UNROLLED
    // Process four elements per iteration to reduce loop overhead.
    for( ; idx <= len - 4; idx += 4 )
        nz += (src[idx] != 0) + (src[idx+1] != 0) + (src[idx+2] != 0) + (src[idx+3] != 0);
    #endif
    // Remainder (or the whole range when unrolling is disabled).
    while( idx < len )
        nz += (src[idx++] != 0);
    return nz;
}
00506 
// Count non-zero bytes in src[0..len): SSE2 or NEON accelerated, with
// a scalar loop for the remaining tail.
static int countNonZero8u( const uchar* src, int len )
{
    int i=0, nz = 0;
#if CV_SSE2
    if(USE_SSE2)//5x-6x
    {
        __m128i v_zero = _mm_setzero_si128();
        __m128i sum = _mm_setzero_si128();

        for (; i<=len-16; i+=16)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
            // cmpeq marks each ZERO byte with 0xFF; 0 - 0xFF == 1, so the
            // SAD against zero accumulates the count of zero bytes into
            // the two 64-bit halves of `sum`.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero));
        }
        // non-zeros = bytes processed - zeros (fold both SAD halves).
        nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)));
    }
#elif CV_NEON
    // Blocked accumulation: each u8 lane counter gains at most 1 per 16
    // bytes, so counters are flushed to 32-bit lanes every 240 bytes
    // (blockSize1) before they can overflow.
    int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
    const uchar * src0 = src;

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint8x16_t v_pz = v_zero;

            // Add 1 into the lane counter for every ZERO byte.
            for( ; k <= blockSizej - 16; k += 16 )
                v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));

            // Widen and flush the 8-bit counters into v_nz.
            uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);

            src0 += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    // v_nz holds the ZERO count; non-zeros = processed - zeros.
    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // Scalar tail for the remaining len - i bytes.
    for( ; i < len; i++ )
        nz += src[i] != 0;
    return nz;
}
00560 
// Count non-zero 16-bit values; SSE2/NEON accelerated, with the tail
// delegated to the generic scalar counter. Note that the vector paths
// advance `src` past the prefix they consumed.
static int countNonZero16u( const ushort* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 8; i += 8)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            // Each zero 16-bit element contributes TWO 1-bytes to the
            // SAD, hence the >> 1 when converting to an element count.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero));
        }

        // non-zeros = elements processed - zeros.
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1);
        src += i;
    }
#elif CV_NEON
    // Blocked accumulation: u16 lane counters gain at most 1 per 8
    // elements, so a 2^15-element block cannot overflow them.
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zero;

            // Add 1 into the lane counter for every ZERO element.
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));

            // Widen and flush the 16-bit counters into v_nz.
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    // v_nz holds the ZERO count; non-zeros = processed - zeros.
    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // Scalar pass over whatever the vector paths did not consume.
    return nz + countNonZero_(src, len - i);
}
00611 
00612 static int countNonZero32s( const int* src, int len )
00613 {
00614     int i = 0, nz = 0;
00615 #if CV_SSE2
00616     if (USE_SSE2)
00617     {
00618         __m128i v_zero = _mm_setzero_si128 ();
00619         __m128i sum = _mm_setzero_si128();
00620 
00621         for ( ; i <= len - 4; i += 4)
00622         {
00623             __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
00624             sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero));
00625         }
00626 
00627         nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
00628         src += i;
00629     }
00630 #elif CV_NEON
00631     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
00632     uint32x4_t v_nz = vdupq_n_u32(0u);
00633     int32x4_t v_zero = vdupq_n_s32(0.0f);
00634     uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);
00635 
00636     while( i < len0 )
00637     {
00638         int blockSizei = std::min(len0 - i, blockSize0), j = 0;
00639 
00640         while (j < blockSizei)
00641         {
00642             int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
00643             uint16x8_t v_pz = v_zerou;
00644 
00645             for( ; k <= blockSizej - 8; k += 8 )
00646                 v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)),
00647                                                               vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1));
00648 
00649             v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);
00650 
00651             src += blockSizej;
00652             j += blockSizej;
00653         }
00654 
00655         i += blockSizei;
00656     }
00657 
00658     CV_DECL_ALIGNED(16) unsigned int buf[4];
00659     vst1q_u32(buf, v_nz);
00660     nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
00661 #endif
00662     return nz + countNonZero_(src, len - i);
00663 }
00664 
// Count non-zero floats (bit-exact comparison against +0.0; note that
// -0.0 compares equal to zero and is therefore NOT counted). The vector
// paths advance `src` past the prefix they consumed.
static int countNonZero32f( const float* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128 v_zero_f = _mm_setzero_ps();
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 4; i += 4)
        {
            __m128 r0 = _mm_loadu_ps(src + i);
            // cmpeq_ps marks each ZERO float with an all-ones mask; each
            // contributes FOUR 1-bytes to the SAD, hence the >> 2 below.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero));
        }

        // non-zeros = elements processed - zeros.
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
        src += i;
    }
#elif CV_NEON
    // Blocked accumulation: u16 lane counters gain at most 1 per 8
    // elements, so a 2^15-element block cannot overflow them.
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    float32x4_t v_zero = vdupq_n_f32(0.0f);
    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zerou;

            // Compare two float32x4 vectors, narrow the masks to u16 and
            // add 1 per ZERO element into the lane counters.
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
                                                              vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));

            // Widen and flush the 16-bit counters into v_nz.
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    // v_nz holds the ZERO count; non-zeros = processed - zeros.
    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // Scalar pass over whatever the vector paths did not consume.
    return nz + countNonZero_(src, len - i);
}
00718 
// No SIMD path for double data: count with the generic scalar template.
static int countNonZero64f( const double* src, int len )
{
    return countNonZero_(src, len);
}
00723 
// Type-erased non-zero-counting kernel signature for the dispatch table.
typedef int (*CountNonZeroFunc)(const uchar*, int);

// Returns the counting kernel for the given matrix depth, or 0 when
// unsupported. Signed depths reuse the unsigned kernel of the same
// element size: the "!= 0" test depends only on the bit pattern.
static CountNonZeroFunc getCountNonZeroTab(int depth)
{
    // Indexed by depth: CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F,
    // CV_64F, then a 0 sentinel.
    static CountNonZeroFunc countNonZeroTab[] =
    {
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
    };

    return countNonZeroTab[depth];
}
00738 
00739 template <typename T, typename ST, typename SQT>
00740 struct SumSqr_SIMD
00741 {
00742     int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
00743     {
00744         return 0;
00745     }
00746 };
00747 
00748 #if CV_SSE2
00749 
// SSE2 kernel computing per-channel sum and sum of squares for uchar
// data. Handles only the unmasked case with 1 or 2 channels; returns
// the number of pixels consumed (the scalar code finishes the tail).
template <>
struct SumSqr_SIMD<uchar, int, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        // Scalar fallback for masked input, unsupported channel counts,
        // or when SSE2 is disabled at runtime.
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;

        // Main loop: 16 bytes per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // Zero-extend the low 8 bytes into 16-bit lanes.
            __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);

            // 16x16 -> 32-bit squares: interleave the low/high product
            // halves; uchar squares (<= 255^2) fit the 32-bit lanes.
            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));

            // Same for the high 8 bytes.
            v_half = _mm_unpackhi_epi8(v_src, v_zero);
            v_mullo = _mm_mullo_epi16(v_half, v_half);
            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // Tail loop: 8 bytes per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);

            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // ar[0..3] = sum lanes, ar[4..7] = sqsum lanes; fold them onto
        // the cn channel slots (x is a multiple of cn, lanes line up).
        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
            {
                sum[j] += ar[j + i];
                sqsum[j] += ar[4 + j + i];
            }

        return x / cn;
    }
};
00808 
// SSE2 kernel computing per-channel sum and sum of squares for schar
// data (unmasked, 1 or 2 channels only). Returns pixels consumed.
template <>
struct SumSqr_SIMD<schar, int, int>
{
    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        // Scalar fallback for masked input, unsupported channel counts,
        // or when SSE2 is disabled at runtime.
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;

        // Main loop: 16 elements per iteration.
        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // Sign-extend bytes to 16 bits (unpack into the high byte of
            // each lane, then arithmetic-shift right by 8).
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            // Squares via the mullo/mulhi 32-bit interleave trick; the
            // square of a signed value is non-negative and fits 32 bits.
            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            // Sign-extend the 16-bit sums to 32 bits (shift by 16).
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));

            // Same for the high 8 bytes.
            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_mullo = _mm_mullo_epi16(v_half, v_half);
            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // Tail loop: 8 elements per iteration.
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
        }

        // ar[0..3] = sum lanes, ar[4..7] = sqsum lanes; fold them onto
        // the cn channel slots (x is a multiple of cn, lanes line up).
        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        for (int i = 0; i < 4; i += cn)
            for (int j = 0; j < cn; ++j)
            {
                sum[j] += ar[j + i];
                sqsum[j] += ar[4 + j + i];
            }

        return x / cn;
    }
};
00867 
00868 #endif
00869 
// Accumulates per-channel sum and sum-of-squares of src0 into sum/sqsum.
// T = source element type, ST = sum accumulator type, SQT = square-sum type.
// Without a mask: returns `len` (all pixels processed); the SIMD functor may
// consume a prefix of the data first, the scalar loops finish the rest.
// With a mask: returns the number of non-zero mask entries (pixels counted).
template<typename T, typename ST, typename SQT>
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
{
    const T* src = src0;

    if( !mask )
    {
        // SIMD prologue: `i` pixels already accumulated; continue from there.
        SumSqr_SIMD<T, ST, SQT> vop;
        int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
        src += i * cn;

        // Handle the first (cn % 4) channels with specialized loops, then the
        // remaining channels four at a time below.
        if( k == 1 )
        {
            ST s0 = sum[0];
            SQT sq0 = sqsum[0];
            for( ; i < len; i++, src += cn )
            {
                T v = src[0];
                s0 += v; sq0 += (SQT)v*v;
            }
            sum[0] = s0;
            sqsum[0] = sq0;
        }
        else if( k == 2 )
        {
            ST s0 = sum[0], s1 = sum[1];
            SQT sq0 = sqsum[0], sq1 = sqsum[1];
            for( ; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
            }
            sum[0] = s0; sum[1] = s1;
            sqsum[0] = sq0; sqsum[1] = sq1;
        }
        else if( k == 3 )
        {
            ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
            SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
            for( ; i < len; i++, src += cn )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
            }
            sum[0] = s0; sum[1] = s1; sum[2] = s2;
            sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
        }

        // Channels [k, cn) in groups of four; note `i` restarts the pixel scan
        // from the SIMD prologue's position for each group of channels.
        for( ; k < cn; k += 4 )
        {
            src = src0 + k;
            ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
            SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
            for( ; i < len; i++, src += cn )
            {
                T v0, v1;
                v0 = src[0], v1 = src[1];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                v0 = src[2], v1 = src[3];
                s2 += v0; sq2 += (SQT)v0*v0;
                s3 += v1; sq3 += (SQT)v1*v1;
            }
            sum[k] = s0; sum[k+1] = s1;
            sum[k+2] = s2; sum[k+3] = s3;
            sqsum[k] = sq0; sqsum[k+1] = sq1;
            sqsum[k+2] = sq2; sqsum[k+3] = sq3;
        }
        return len;
    }

    // Masked path: only pixels with a non-zero mask byte contribute;
    // nzm counts how many did.
    int i, nzm = 0;

    if( cn == 1 )
    {
        ST s0 = sum[0];
        SQT sq0 = sqsum[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                T v = src[i];
                s0 += v; sq0 += (SQT)v*v;
                nzm++;
            }
        sum[0] = s0;
        sqsum[0] = sq0;
    }
    else if( cn == 3 )
    {
        ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
        SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                T v0 = src[0], v1 = src[1], v2 = src[2];
                s0 += v0; sq0 += (SQT)v0*v0;
                s1 += v1; sq1 += (SQT)v1*v1;
                s2 += v2; sq2 += (SQT)v2*v2;
                nzm++;
            }
        sum[0] = s0; sum[1] = s1; sum[2] = s2;
        sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2;
    }
    else
    {
        // Generic channel count: accumulate each channel through memory.
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
                {
                    T v = src[k];
                    ST s = sum[k] + v;
                    SQT sq = sqsum[k] + (SQT)v*v;
                    sum[k] = s; sqsum[k] = sq;
                }
                nzm++;
            }
    }
    return nzm;
}
00993 
00994 
// Depth-specific instantiations of sumsqr_ used to populate getSumSqrTab().
// Accumulator types widen with source depth: int sums for 8-bit sources,
// double square-sums from 16-bit upward, double sums from 32-bit upward.
static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
01015 
01016 typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);
01017 
01018 static SumSqrFunc getSumSqrTab(int depth)
01019 {
01020     static SumSqrFunc sumSqrTab[] =
01021     {
01022         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
01023         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
01024     };
01025 
01026     return sumSqrTab[depth];
01027 }
01028 
01029 #ifdef HAVE_OPENCL
01030 
01031 template <typename T> Scalar ocl_part_sum(Mat m)
01032 {
01033     CV_Assert(m.rows == 1);
01034 
01035     Scalar s = Scalar::all(0);
01036     int cn = m.channels();
01037     const T * const ptr = m.ptr<T>(0);
01038 
01039     for (int x = 0, w = m.cols * cn; x < w; )
01040         for (int c = 0; c < cn; ++c, ++x)
01041             s[c] += ptr[x];
01042 
01043     return s;
01044 }
01045 
// Reduction operations supported by the OpenCL "reduce" kernel.
enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };

// OpenCL implementation of sum / sum-of-abs / sum-of-squares over _src
// (optionally masked, optionally a second operand _src2 with a secondary
// result res2 when calc2 is set). Returns false when the device/type
// combination is unsupported or the kernel fails, so the caller can fall
// back to the CPU path.
static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
                     InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
{
    CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0,
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // kercn: vector width per work-item; only widened for unmasked 1-channel data.
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
            mcn = std::max(cn, kercn);
    CV_Assert(!haveSrc2 || _src2.type() == type);
    int convert_cn = haveSrc2 ? mcn : cn;

    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;

    // One partial result per compute unit; doubled when calc2 requests a
    // second accumulator.
    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
    size_t wgs = dev.maxWorkGroupSize();

    // Accumulator depth: CV_32F for squares, otherwise at least CV_32S.
    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
    CV_Assert(!haveMask || _mask.type() == CV_8UC1);

    // Largest power of two not exceeding the work-group size.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
    char cvt[2][40];
    String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                         ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                         ocl::typeToStr(ddepth), ddepth, cn,
                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
                         opMap[sum_op], (int)wgs, wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         haveMask ? " -D HAVE_MASK" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
        db(1, dbsize, dtype), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dbarg = ocl::KernelArg::PtrWriteOnly(db),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);

    if (haveMask)
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
    }
    else
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
    }

    size_t globalsize = ngroups * wgs;
    if (k.run(1, &globalsize, &wgs, false))
    {
        // Fold the per-group partial sums on the host, picking the reducer
        // matching the accumulator depth.
        typedef Scalar (*part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                func = funcs[ddepth - CV_32S];

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
            // NOTE: res2 is declared const (to allow a default argument) but is
            // deliberately written through const_cast when calc2 is requested.
            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));

        res = func(mres.colRange(0, ngroups));
        return true;
    }
    return false;
}
01138 
01139 #endif
01140 
01141 #ifdef HAVE_IPP
// IPP implementation of cv::sum. Returns true and fills _res on success,
// false to fall back to the generic path (unsupported type/layout, IPP error,
// or IPP older than 7.0).
static bool ipp_sum(Mat &src, Scalar &_res)
{
#if IPP_VERSION_X100 >= 700
    int cn = src.channels();
    size_t total_size = src.total();
    // Treat a continuous N-dim matrix as a rows x cols 2-D image.
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
        // Two IPP signatures exist: 32f variants take an accuracy hint,
        // integer variants do not.
        typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
        typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
        ippiSumFuncHint ippFuncHint =
            type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
            type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
            type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
            0;
        ippiSumFuncNoHint ippFuncNoHint =
            type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
            type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
            type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
            type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
            type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
            type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
            type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
            type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
            type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
            0;
        // At most one of the two pointers may be set for a given type.
        CV_Assert(!ippFuncHint || !ippFuncNoHint);
        if( ippFuncHint || ippFuncNoHint )
        {
            Ipp64f res[4];
            IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                            ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
            if( ret >= 0 )
            {
                for( int i = 0; i < cn; i++ )
                    _res[i] = res[i];
                return true;
            }
        }
    }
#else
    CV_UNUSED(src); CV_UNUSED(_res);
#endif
    return false;
}
01189 #endif
01190 
01191 }
01192 
// Computes the per-channel sum of all elements of _src (up to 4 channels).
// Tries the OpenCL and IPP paths first, then falls back to the generic
// plane-by-plane accumulation.
cv::Scalar  cv::sum( InputArray _src )
{
#if defined HAVE_OPENCL || defined HAVE_IPP
    Scalar _res;
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_sum(_src, _res, OCL_OP_SUM),
                _res)
#endif

    Mat src = _src.getMat();
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res);

    int k, cn = src.channels(), depth = src.depth();
    SumFunc func = getSumFunc(depth);
    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    // For integer depths below CV_32S the worker accumulates into int; buf
    // then points at a temporary int buffer that is flushed into the double
    // Scalar before it can overflow. Otherwise buf aliases s directly.
    int* buf = (int*)&s[0];
    size_t esz = 0;
    bool blockSum = depth < CV_32S;

    if( blockSum )
    {
        // Block sizes chosen so an int accumulator cannot overflow:
        // 2^23 8-bit values or 2^15 16-bit values per flush.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], 0, (uchar*)buf, bsz, cn );
            count += bsz;
            // Flush the int block accumulator into the Scalar when the next
            // block might overflow, or at the very end of the data.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
        }
    }
    return s;
}
01256 
01257 #ifdef HAVE_OPENCL
01258 
01259 namespace cv {
01260 
// OpenCL implementation of countNonZero: runs the "reduce" kernel with
// OP_COUNT_NON_ZERO producing per-group partial counts, then sums them on the
// host. Returns false when unsupported so the caller falls back to the CPU.
static bool ocl_countNonZero( InputArray _src, int & res )
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if (depth == CV_64F && !doubleSupport)
        return false;

    // One partial count per compute unit.
    int dbsize = ocl::Device::getDefault().maxComputeUnits();
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();

    // Largest power of two not exceeding the work-group size.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(depth), (int)wgs, kercn,
                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
           dbsize, ocl::KernelArg::PtrWriteOnly(db));

    size_t globalsize = dbsize * wgs;
    // Synchronous run; on success the total is the sum of the partial counts.
    if (k.run(1, &globalsize, &wgs, true))
        return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
    return false;
}
01296 
01297 }
01298 
01299 #endif
01300 
01301 #if defined HAVE_IPP
01302 namespace cv {
01303 
01304 static bool ipp_countNonZero( Mat &src, int &res )
01305 {
01306 #if !defined HAVE_IPP_ICV_ONLY
01307     Ipp32s count = 0;
01308     IppStatus status = ippStsNoErr;
01309 
01310     int type = src.type(), depth = CV_MAT_DEPTH(type);
01311     IppiSize roiSize = { src.cols, src.rows };
01312     Ipp32s srcstep = (Ipp32s)src.step;
01313     if (src.isContinuous())
01314     {
01315         roiSize.width = (Ipp32s)src.total();
01316         roiSize.height = 1;
01317         srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type);
01318     }
01319 
01320     if (depth == CV_8U)
01321         status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0);
01322     else if (depth == CV_32F)
01323         status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0);
01324 
01325     if (status >= 0)
01326     {
01327         res = ((Ipp32s)src.total() - count);
01328         return true;
01329     }
01330 #else
01331     CV_UNUSED(src); CV_UNUSED(res);
01332 #endif
01333     return false;
01334 }
01335 }
01336 #endif
01337 
01338 
01339 int cv::countNonZero( InputArray _src )
01340 {
01341     int type = _src.type(), cn = CV_MAT_CN(type);
01342     CV_Assert( cn == 1 );
01343 
01344 #if defined HAVE_OPENCL || defined HAVE_IPP
01345     int res = -1;
01346 #endif
01347 
01348 #ifdef HAVE_OPENCL
01349     CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
01350                 ocl_countNonZero(_src, res),
01351                 res)
01352 #endif
01353 
01354     Mat src = _src.getMat();
01355     CV_IPP_RUN(0 && (_src.dims() <= 2 || _src.isContinuous()), ipp_countNonZero(src, res), res);
01356 
01357     CountNonZeroFunc func = getCountNonZeroTab(src.depth());
01358     CV_Assert( func != 0 );
01359 
01360     const Mat* arrays[] = {&src, 0};
01361     uchar* ptrs[1];
01362     NAryMatIterator it(arrays, ptrs);
01363     int total = (int)it.size, nz = 0;
01364 
01365     for( size_t i = 0; i < it.nplanes; i++, ++it )
01366         nz += func( ptrs[0], total );
01367 
01368     return nz;
01369 }
01370 
01371 #if defined HAVE_IPP
01372 namespace cv
01373 {
// IPP implementation of cv::mean, with and without a mask. Returns true and
// fills ret on success, false to fall back to the generic path.
static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
{
#if IPP_VERSION_X100 >= 700
    size_t total_size = src.total();
    // Treat a continuous N-dim matrix as a rows x cols 2-D image.
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            // Masked mean: single-channel variant first.
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
            ippiMaskMeanFuncC1 ippFuncC1 =
            type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
            0;
            if( ippFuncC1 )
            {
                Ipp64f res;
                if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
                {
                    ret = Scalar(res);
                    return true;
                }
            }
            // Masked 3-channel mean: IPP processes one channel of interest
            // (1-based index) per call, so call the function three times.
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskMeanFuncC3 ippFuncC3 =
            type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
            0;
            if( ippFuncC3 )
            {
                Ipp64f res1, res2, res3;
                if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
                {
                    ret = Scalar(res1, res2, res3);
                    return true;
                }
            }
        }
        else
        {
            // Unmasked mean: 32f variants take an accuracy hint, integer
            // variants do not.
            typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
            typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
            ippiMeanFuncHint ippFuncHint =
                type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
                type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
                type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
                0;
            ippiMeanFuncNoHint ippFuncNoHint =
                type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
                type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
                type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
                type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
                type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
                type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
                type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
                type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
                type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
                0;
            // Make sure only zero or one version of the function pointer is valid
            CV_Assert(!ippFuncHint || !ippFuncNoHint);
            if( ippFuncHint || ippFuncNoHint )
            {
                Ipp64f res[4];
                IppStatus status = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                                ippFuncNoHint(src.ptr(), (int)src.step[0], sz, res);
                if( status >= 0 )
                {
                    for( int i = 0; i < src.channels(); i++ )
                        ret[i] = res[i];
                    return true;
                }
            }
        }
    }
    return false;
#else
    return false;
#endif
}
01459 }
01460 #endif
01461 
// Computes the per-channel mean of _src, optionally restricted to pixels with
// a non-zero mask byte. Uses the same overflow-safe block accumulation scheme
// as cv::sum, plus a count of contributing pixels for the final division.
cv::Scalar  cv::mean( InputArray _src, InputArray _mask )
{
    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8U );

    int k, cn = src.channels(), depth = src.depth();
    Scalar  s;

    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_mean(src, mask, s), s)

    SumFunc func = getSumFunc(depth);

    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int>  _buf;
    // For depths up to CV_16S the worker accumulates into int; buf then points
    // at a temporary int buffer flushed into the double Scalar before it can
    // overflow. Otherwise buf aliases s directly.
    int* buf = (int*)&s[0];
    bool blockSum = depth <= CV_16S;
    // nz0 counts all contributing pixels (all of them when there is no mask).
    size_t esz = 0, nz0 = 0;

    if( blockSum )
    {
        // Block sizes chosen so an int accumulator cannot overflow:
        // 2^23 8-bit values or 2^15 16-bit values per flush.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // func returns the number of pixels it actually accumulated
            // (mask-aware); ptrs[1] is null when there is no mask.
            int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
            count += nz;
            nz0 += nz;
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }
    // Divide by the pixel count; an all-zero mask yields Scalar::all(0).
    return s*(nz0 ? 1./nz0 : 0);
}
01522 
01523 #ifdef HAVE_OPENCL
01524 
01525 namespace cv {
01526 
// OpenCL implementation of meanStdDev. Runs a kernel producing per-group
// partial sums, square sums and (if masked) pixel counts, reduces them on the
// host, and writes the mean / standard deviation into the requested outputs.
// Returns false when unsupported so the caller falls back to the CPU path.
static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    bool haveMask = _mask.kind() != _InputArray::NONE;
    // Without a mask the pixel count is known up front; with a mask it is
    // computed by the kernel (third partial buffer below).
    int nz = haveMask ? -1 : (int)_src.total();
    Scalar mean, stddev;

    {
        int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
                isContinuous = _src.isContinuous(),
                isMaskContinuous = _mask.isContinuous();
        const ocl::Device &defDev = ocl::Device::getDefault();
        int groups = defDev.maxComputeUnits();
        if (defDev.isIntel())
        {
            // Empirical tuning for Intel GPUs: two groups per sub-slice.
            static const int subSliceEUCount = 10;
            groups = (groups / subSliceEUCount) * 2;
        }
        size_t wgs = defDev.maxWorkGroupSize();

        // Accumulator depths: at least CV_32S for sums, CV_32F for square sums.
        int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
                dtype = CV_MAKE_TYPE(ddepth, cn),
                sqdtype = CV_MAKETYPE(sqddepth, cn);
        CV_Assert(!haveMask || _mask.type() == CV_8UC1);

        // Largest power of two not exceeding the work-group size.
        int wgs2_aligned = 1;
        while (wgs2_aligned < (int)wgs)
            wgs2_aligned <<= 1;
        wgs2_aligned >>= 1;

        if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
            return false;

        char cvt[2][40];
        String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
                             " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                             ocl::typeToStr(type), ocl::typeToStr(depth),
                             ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
                             ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
                             ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
                             cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
                             isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
                             ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
                             (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");

        ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
        if (k.empty())
            return false;

        // Partial buffer layout per group: [sums][square sums][counts if masked].
        int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
                               CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
        UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
                dbarg = ocl::KernelArg::PtrWriteOnly(db),
                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

        if (haveMask)
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
        else
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);

        size_t globalsize = groups * wgs;
        if (!k.run(1, &globalsize, &wgs, false))
            return false;

        // Host-side reduction of the per-group partial results, picking the
        // reducer matching each accumulator depth.
        typedef Scalar (* part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
        Mat dbm = db.getMat(ACCESS_READ);

        mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
        stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));

        if (haveMask)
            nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
                                                 groups * (CV_ELEM_SIZE(dtype) +
                                                           CV_ELEM_SIZE(sqdtype))))[0]);
    }

    // `total` is actually 1/nz (0 when no pixels were selected).
    double total = nz != 0 ? 1.0 / nz : 0;
    int k, j, cn = _src.channels();
    for (int i = 0; i < cn; ++i)
    {
        mean[i] *= total;
        // Var = E[x^2] - E[x]^2, clamped at 0 against rounding error.
        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
    }

    // Write mean (j == 0) and stddev (j == 1) into any requested output as a
    // CV_64F vector with at least cn entries; extra entries are zeroed.
    for( j = 0; j < 2; j++ )
    {
        const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }

    return true;
}
01638 
01639 }
01640 
01641 #endif
01642 
01643 #ifdef HAVE_IPP
01644 namespace cv
01645 {
// IPP-accelerated meanStdDev for 2D (or effectively-continuous) arrays.
// Dispatches to per-type IPP primitives for 1- and 3-channel 8u/16u/32f
// inputs; returns true on success, false to fall back to the generic path.
static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
{
#if IPP_VERSION_X100 >= 700
    int cn = src.channels();
    size_t total_size = src.total();
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    // IPP operates on one 2D plane: accept either a true 2D array, or a
    // continuous n-D array (with continuous mask) viewable as rows x cols.
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        // Scratch outputs used when the caller did not ask for mean/stddev;
        // the IPP functions always need valid destination pointers.
        Ipp64f mean_temp[3];
        Ipp64f stddev_temp[3];
        Ipp64f *pmean = &mean_temp[0];
        Ipp64f *pstddev = &stddev_temp[0];
        Mat mean, stddev;
        int dcn_mean = -1;
        if( _mean.needed() )
        {
            if( !_mean.fixedSize() )
                _mean.create(cn, 1, CV_64F, -1, true);
            mean = _mean.getMat();
            dcn_mean = (int)mean.total();
            pmean = mean.ptr<Ipp64f>();
        }
        int dcn_stddev = -1;
        if( _sdv.needed() )
        {
            if( !_sdv.fixedSize() )
                _sdv.create(cn, 1, CV_64F, -1, true);
            stddev = _sdv.getMat();
            dcn_stddev = (int)stddev.total();
            pstddev = stddev.ptr<Ipp64f>();
        }
        // Zero any trailing elements of fixed-size outputs beyond cn channels.
        for( int c = cn; c < dcn_mean; c++ )
            pmean[c] = 0;
        for( int c = cn; c < dcn_stddev; c++ )
            pstddev[c] = 0;
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC1 ippFuncC1 =
            type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
            0;
            if( ippFuncC1 )
            {
                if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
                {
                    return true;
                }
            }
            // 3-channel masked variants process one channel of interest (COI,
            // 1-based) per call, so invoke them once per channel.
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC3 ippFuncC3 =
            type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
            0;
            if( ippFuncC3 )
            {
                if( ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                {
                    return true;
                }
            }
        }
        else
        {
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
            ippiMeanStdDevFuncC1 ippFuncC1 =
            type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
            type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
#if (IPP_VERSION_X100 >= 810)
            type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
#endif
            0;
            if( ippFuncC1 )
            {
                if( ippFuncC1(src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
                {
                    return true;
                }
            }
            // Unmasked 3-channel variants are likewise per-channel (COI).
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
            ippiMeanStdDevFuncC3 ippFuncC3 =
            type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
            type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
            type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
            0;
            if( ippFuncC3 )
            {
                if( ippFuncC3(src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    ippFuncC3(src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                {
                    return true;
                }
            }
        }
    }
#else
    CV_UNUSED(src); CV_UNUSED(_mean); CV_UNUSED(_sdv); CV_UNUSED(mask);
#endif
    return false;
}
01753 }
01754 #endif
01755 
// Computes the per-channel mean and standard deviation of _src, optionally
// restricted to non-zero elements of _mask (CV_8UC1). Results are written as
// CV_64F column vectors into _mean / _sdv when those outputs are requested.
// Tries the OpenCL and IPP fast paths first; otherwise accumulates sum and
// sum-of-squares via the depth-specific SumSqrFunc table.
void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
#ifdef HAVE_OPENCL
    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))
#endif

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask));

    int k, cn = src.channels(), depth = src.depth();

    SumSqrFunc func = getSumSqrTab(depth);

    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0, nz0 = 0;
    // _buf holds up to four cn-sized sections: double sums, double square
    // sums, and (for small depths) int partial sums aliased into the tail.
    AutoBuffer<double> _buf(cn*4);
    double *s = (double*)_buf, *sq = s + cn;
    int *sbuf = (int*)s, *sqbuf = (int*)sq;
    // For small integer depths the inner reduction accumulates in int and is
    // periodically flushed into the double accumulators to avoid overflow.
    bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
    size_t esz = 0;

    for( k = 0; k < cn; k++ )
        s[k] = sq[k] = 0;

    if( blockSum )
    {
        // Flush at most every 2^15 accumulated samples so int sums cannot overflow.
        intSumBlockSize = 1 << 15;
        blockSize = std::min(blockSize, intSumBlockSize);
        sbuf = (int*)(sq + cn);
        if( blockSqSum )
            sqbuf = sbuf + cn;
        for( k = 0; k < cn; k++ )
            sbuf[k] = sqbuf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // func returns the number of (mask-passing) pixels it processed.
            int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
            count += nz;
            nz0 += nz;
            // Flush int partials into the double accumulators when nearing the
            // overflow bound, or when this is the very last block.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += sbuf[k];
                    sbuf[k] = 0;
                }
                if( blockSqSum )
                {
                    for( k = 0; k < cn; k++ )
                    {
                        sq[k] += sqbuf[k];
                        sqbuf[k] = 0;
                    }
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }

    // Finalize: mean = sum/nz0, stddev = sqrt(E[x^2] - E[x]^2), clamped at 0
    // to guard against tiny negative values from floating-point rounding.
    double scale = nz0 ? 1./nz0 : 0.;
    for( k = 0; k < cn; k++ )
    {
        s[k] *= scale;
        sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
    }

    // Write mean (j==0) then stddev (j==1) into their output arrays, zeroing
    // any extra trailing elements of fixed-size destinations.
    for( j = 0; j < 2; j++ )
    {
        const double* sptr = j == 0 ? s : sq;
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }
}
01858 
01859 /****************************************************************************************\
01860 *                                       minMaxLoc                                        *
01861 \****************************************************************************************/
01862 
01863 namespace cv
01864 {
01865 
// Scans one block of `len` samples and updates the running minimum/maximum
// (and their linear indices) in place. The caller seeds *_minVal/*_maxVal and
// the index slots so results accumulate correctly across successive blocks;
// `startIdx` is the 1-based linear index of src[0]. If `mask` is non-NULL,
// only elements with a non-zero mask byte participate. Strict comparisons
// mean the first occurrence of an extremum keeps its index.
template<typename T, typename WT> static void
minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal,
            size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx )
{
    WT curMin = *_minVal, curMax = *_maxVal;
    size_t bestMinIdx = *_minIdx, bestMaxIdx = *_maxIdx;

    if( mask )
    {
        for( int i = 0; i < len; i++ )
        {
            if( !mask[i] )
                continue;
            T val = src[i];
            if( val < curMin )
            {
                curMin = val;
                bestMinIdx = startIdx + i;
            }
            if( val > curMax )
            {
                curMax = val;
                bestMaxIdx = startIdx + i;
            }
        }
    }
    else
    {
        for( int i = 0; i < len; i++ )
        {
            T val = src[i];
            if( val < curMin )
            {
                curMin = val;
                bestMinIdx = startIdx + i;
            }
            if( val > curMax )
            {
                curMax = val;
                bestMaxIdx = startIdx + i;
            }
        }
    }

    *_minVal = curMin;
    *_maxVal = curMax;
    *_minIdx = bestMinIdx;
    *_maxIdx = bestMaxIdx;
}
01913 
// Depth-specific instantiations of the minMaxIdx_ template with the uniform
// pointer-based signature required by the MinMaxIdxFunc table. Integer
// depths accumulate the running extrema as int; 32f/64f keep source precision.
static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
01941 
01942 typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);
01943 
// Returns the min/max-with-index reduction routine for a given element depth.
// Table slots follow the depth enumeration order used for indexing
// (8u, 8s, 16u, 16s, 32s, 32f, 64f); the trailing 0 slot yields no
// implementation for any remaining depth value.
static MinMaxIdxFunc getMinmaxTab(int depth)
{
    static MinMaxIdxFunc minmaxTab[] =
    {
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
        0
    };

    return minmaxTab[depth];
}
01957 
01958 static void ofs2idx(const Mat& a, size_t ofs, int* idx)
01959 {
01960     int i, d = a.dims;
01961     if( ofs > 0 )
01962     {
01963         ofs--;
01964         for( i = d-1; i >= 0; i-- )
01965         {
01966             int sz = a.size[i];
01967             idx[i] = (int)(ofs % sz);
01968             ofs /= sz;
01969         }
01970     }
01971     else
01972     {
01973         for( i = d-1; i >= 0; i-- )
01974             idx[i] = -1;
01975     }
01976 }
01977 
01978 #ifdef HAVE_OPENCL
01979 
01980 #define MINMAX_STRUCT_ALIGNMENT 8 // sizeof double
01981 
// Host-side reduction of the per-workgroup partial results produced by the
// "minmaxloc" OpenCL kernel. The db buffer holds consecutive sections of
// `groupnum` entries each, every section padded to MINMAX_STRUCT_ALIGNMENT,
// and a section is present only when the corresponding output was requested:
//   min values | max values | min locations | max locations | second max.
template <typename T>
void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
                  int* minLoc, int* maxLoc,
                  int groupnum, int cols, double * maxVal2)
{
    // index_max marks a location slot that was never written by the kernel.
    uint index_max = std::numeric_limits<uint>::max();
    T minval = std::numeric_limits<T>::max();
    // For floating-point T, numeric_limits::min() is the smallest POSITIVE
    // value, so seed the maximum with -max() instead of min() in that case.
    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
    uint minloc = index_max, maxloc = index_max;

    // Walk the buffer layout, advancing `index` past each section that was
    // requested; each section start is re-aligned to the struct alignment.
    size_t index = 0;
    const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
    const uint * minlocptr = NULL, * maxlocptr = NULL;
    if (minVal || minLoc)
    {
        minptr = db.ptr<T>();
        index += sizeof(T) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxVal || maxLoc)
    {
        maxptr = (const T *)(db.ptr() + index);
        index += sizeof(T) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (minLoc)
    {
        minlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxLoc)
    {
        maxlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxVal2)
        maxptr2 = (const T *)(db.ptr() + index);

    // Merge the per-group partials. On equal values, the SMALLEST linear
    // location wins, matching first-occurrence semantics of the CPU path.
    for (int i = 0; i < groupnum; i++)
    {
        if (minptr && minptr[i] <= minval)
        {
            if (minptr[i] == minval)
            {
                if (minlocptr)
                    minloc = std::min(minlocptr[i], minloc);
            }
            else
            {
                if (minlocptr)
                    minloc = minlocptr[i];
                minval = minptr[i];
            }
        }
        if (maxptr && maxptr[i] >= maxval)
        {
            if (maxptr[i] == maxval)
            {
                if (maxlocptr)
                    maxloc = std::min(maxlocptr[i], maxloc);
            }
            else
            {
                if (maxlocptr)
                    maxloc = maxlocptr[i];
                maxval = maxptr[i];
            }
        }
        if (maxptr2 && maxptr2[i] > maxval2)
            maxval2 = maxptr2[i];
    }
    // If a requested location was never set, no element passed the mask:
    // report zero values and (-1, -1) locations.
    bool zero_mask = (minLoc && minloc == index_max) ||
            (maxLoc && maxloc == index_max);

    if (minVal)
        *minVal = zero_mask ? 0 : (double)minval;
    if (maxVal)
        *maxVal = zero_mask ? 0 : (double)maxval;
    if (maxVal2)
        *maxVal2 = zero_mask ? 0 : (double)maxval2;

    // Convert linear locations to (row, col) pairs.
    if (minLoc)
    {
        minLoc[0] = zero_mask ? -1 : minloc / cols;
        minLoc[1] = zero_mask ? -1 : minloc % cols;
    }
    if (maxLoc)
    {
        maxLoc[0] = zero_mask ? -1 : maxloc / cols;
        maxLoc[1] = zero_mask ? -1 : maxloc % cols;
    }
}
02076 
02077 typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
02078                                  int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);
02079 
// OpenCL implementation behind minMaxIdx (and, via the optional trailing
// parameters, other reductions in this file):
//   ddepth    - accumulator/result depth, -1 means "same as source depth"
//   absValues - reduce over absolute values (OP_ABS)
//   _src2     - optional second operand (HAVE_SRC2)
//   maxVal2   - optional secondary maximum output (OP_CALC2)
// Returns false whenever the OpenCL path is unsuitable so the caller can
// fall back to the CPU implementation.
static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
                           int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL)
{
    const ocl::Device & dev = ocl::Device::getDefault();

#ifdef ANDROID
    if (dev.isNVidia())
        return false;
#endif

    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));

    // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
    if ((haveMask || type == CV_32FC1) && dev.isAMD())
        return false;

    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
              (cn >= 1 && !minLoc && !maxLoc) );

    if (ddepth < 0)
        ddepth = depth;

    CV_Assert(!haveSrc2 || _src2.type() == type);

    if (depth == CV_32S)
        return false;

    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
        return false;

    int groupnum = dev.maxComputeUnits();
    size_t wgs = dev.maxWorkGroupSize();

    // Round the work-group size DOWN to a power of two for the kernel's
    // tree reduction.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
            needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;

    // in case of mask we must know whether mask is filled with zeros or not
    // so let's calculate min or max location, if it's undefined, so mask is zeros
    if (!(needMaxLoc || needMinLoc) && haveMask)
    {
        if (needMinVal)
            needMinLoc = true;
        else
            needMaxLoc = true;
    }

    // Build-option string selecting the kernel's compiled-in behavior; the
    // NEED_* flags must match the buffer layout consumed by getMinMaxRes.
    char cvt[2][40];
    String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s"
                         " -D MINMAX_STRUCT_ALIGNMENT=%d",
                         depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                         ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
                         absValues ? " -D OP_ABS" : "",
                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert",
                         MINMAX_STRUCT_ALIGNMENT);

    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
    if (k.empty())
        return false;

    // One aligned section of `groupnum` entries per requested output, plus
    // slack for up to five alignment paddings (see getMinMaxRes layout).
    int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
            dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
                                 (maxVal2 ? esz : 0))
                     + 5 * MINMAX_STRUCT_ALIGNMENT;
    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

    // Without a mask (and hence without locations), multi-channel data can be
    // reduced as a single-channel array.
    if (cn > 1 && !haveMask)
    {
        src = src.reshape(1);
        src2 = src2.reshape(1);
    }

    if (haveSrc2)
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
                   ocl::KernelArg::ReadOnlyNoSize(src2));
    }
    else
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
    }

    size_t globalsize = groupnum * wgs;
    if (!k.run(1, &globalsize, &wgs, true))
        return false;

    // Host-side merge of the per-group partials, dispatched on result depth.
    static const getMinMaxResFunc functab[7] =
    {
        getMinMaxRes<uchar>,
        getMinMaxRes<char>,
        getMinMaxRes<ushort>,
        getMinMaxRes<short>,
        getMinMaxRes<int>,
        getMinMaxRes<float>,
        getMinMaxRes<double>
    };

    getMinMaxResFunc func = functab[ddepth];

    // locTemp receives a location that was computed only for the all-zero-mask
    // check (see above) when the caller did not ask for it.
    int locTemp[2];
    func(db.getMat(ACCESS_READ), minVal, maxVal,
         needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
         groupnum, src.cols, maxVal2);

    return true;
}
02216 
02217 #endif
02218 
02219 #ifdef HAVE_IPP
// IPP-accelerated minMaxIdx for 2D (or effectively-continuous) arrays.
// Returns true on success, false to fall back to the generic path. Note the
// IPP primitives report extrema as Ipp32f even for integer inputs, and
// report locations as (x, y) points which are converted to linear offsets
// (1-based) for ofs2idx.
static bool ipp_minMaxIdx( Mat &src, double* minVal, double* maxVal, int* minIdx, int* maxIdx, Mat &mask)
{
#if IPP_VERSION_X100 >= 700
    int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    size_t total_size = src.total();
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        // Channels are flattened into the width for the IPP call.
        IppiSize sz = { cols * cn, rows };

        if( !mask.empty() )
        {
            typedef IppStatus (CV_STDCALL* ippiMaskMinMaxIndxFuncC1)(const void *, int, const void *, int,
                                                                        IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);

            CV_SUPPRESS_DEPRECATED_START
            ippiMaskMinMaxIndxFuncC1 ippFuncC1 =
                type == CV_8UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1MR :
#if IPP_VERSION_X100 < 900
                type == CV_8SC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1MR :
#endif
                type == CV_16UC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1MR : 0;
            CV_SUPPRESS_DEPRECATED_END

            if( ippFuncC1 )
            {
                Ipp32f min, max;
                IppiPoint minp, maxp;
                if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
                {
                    if( minVal )
                        *minVal = (double)min;
                    if( maxVal )
                        *maxVal = (double)max;
                    // Both locations at (0,0) with a zero first mask byte is
                    // taken to mean the mask was all zeros: report "not found".
                    if( !minp.x && !minp.y && !maxp.x && !maxp.y && !mask.ptr()[0] )
                        minp.x = maxp.x = -1;
                    if( minIdx )
                    {
                        size_t minidx = minp.y * cols + minp.x + 1;
                        ofs2idx(src, minidx, minIdx);
                    }
                    if( maxIdx )
                    {
                        size_t maxidx = maxp.y * cols + maxp.x + 1;
                        ofs2idx(src, maxidx, maxIdx);
                    }
                    return true;
                }
            }
        }
        else
        {
            typedef IppStatus (CV_STDCALL* ippiMinMaxIndxFuncC1)(const void *, int, IppiSize, Ipp32f *, Ipp32f *, IppiPoint *, IppiPoint *);

            CV_SUPPRESS_DEPRECATED_START
            ippiMinMaxIndxFuncC1 ippFuncC1 =
#if IPP_VERSION_X100 != 900 // bug in 9.0.0 avx2 optimization
                depth == CV_8U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8u_C1R :
#endif
#if IPP_VERSION_X100 < 900
                depth == CV_8S ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_8s_C1R :
#endif
                depth == CV_16U ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_16u_C1R :
#if !((defined _MSC_VER && defined _M_IX86) || defined __i386__)
                depth == CV_32F ? (ippiMinMaxIndxFuncC1)ippiMinMaxIndx_32f_C1R :
#endif
                0;
            CV_SUPPRESS_DEPRECATED_END

            if( ippFuncC1 )
            {
                Ipp32f min, max;
                IppiPoint minp, maxp;
                if( ippFuncC1(src.ptr(), (int)src.step[0], sz, &min, &max, &minp, &maxp) >= 0 )
                {
                    if( minVal )
                        *minVal = (double)min;
                    if( maxVal )
                        *maxVal = (double)max;
                    if( minIdx )
                    {
                        size_t minidx = minp.y * cols + minp.x + 1;
                        ofs2idx(src, minidx, minIdx);
                    }
                    if( maxIdx )
                    {
                        size_t maxidx = maxp.y * cols + maxp.x + 1;
                        ofs2idx(src, maxidx, maxIdx);
                    }
                    return true;
                }
            }
        }
    }
#else
#endif
    CV_UNUSED(src); CV_UNUSED(minVal); CV_UNUSED(maxVal); CV_UNUSED(minIdx); CV_UNUSED(maxIdx); CV_UNUSED(mask);
    return false;
}
02320 #endif
02321 
02322 }
02323 
// Finds the global minimum and maximum of _src (single-channel with optional
// CV_8U mask, or multi-channel treated as a flat array when no indices are
// requested), writing values into minVal/maxVal and n-dimensional indices
// into minIdx/maxIdx when those pointers are non-NULL. Tries OpenCL and IPP
// fast paths first.
void cv::minMaxIdx(InputArray _src, double* minVal,
                   double* maxVal, int* minIdx, int* maxIdx,
                   InputArray _mask)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
        (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );

#ifdef HAVE_OPENCL
    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
               ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
#endif

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))

    MinMaxIdxFunc func = getMinmaxTab(depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);

    // minidx/maxidx are 1-based linear offsets; 0 means "not found yet".
    size_t minidx = 0, maxidx = 0;
    int iminval = INT_MAX, imaxval = INT_MIN;
    float  fminval = std::numeric_limits<float>::infinity(),  fmaxval = -fminval;
    double dminval = std::numeric_limits<double>::infinity(), dmaxval = -dminval;
    size_t startidx = 1;
    int *minval = &iminval, *maxval = &imaxval;
    int planeSize = (int)it.size*cn;

    // The table functions all take int* accumulator slots; for float/double
    // depths those pointers are re-aimed (type-punned) at the matching
    // same-size accumulators.
    if( depth == CV_32F )
        minval = (int*)&fminval, maxval = (int*)&fmaxval;
    else if( depth == CV_64F )
        minval = (int*)&dminval, maxval = (int*)&dmaxval;

    for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
        func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );

    // Non-empty unmasked input always has extrema; ensure indices are set
    // (e.g. when every element equals the seed values, nothing was recorded).
    if (!src.empty() && mask.empty())
    {
        if( minidx == 0 )
             minidx = 1;
         if( maxidx == 0 )
             maxidx = 1;
    }

    // Collapse all accumulators into doubles; minidx == 0 means the mask
    // rejected everything, reported as zero values.
    if( minidx == 0 )
        dminval = dmaxval = 0;
    else if( depth == CV_32F )
        dminval = fminval, dmaxval = fmaxval;
    else if( depth <= CV_32S )
        dminval = iminval, dmaxval = imaxval;

    if( minVal )
        *minVal = dminval;
    if( maxVal )
        *maxVal = dmaxval;

    if( minIdx )
        ofs2idx(src, minidx, minIdx);
    if( maxIdx )
        ofs2idx(src, maxidx, maxIdx);
}
02388 
02389 void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
02390                     Point* minLoc, Point* maxLoc, InputArray mask )
02391 {
02392     CV_Assert(_img.dims() <= 2);
02393 
02394     minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
02395     if( minLoc )
02396         std::swap(minLoc->x, minLoc->y);
02397     if( maxLoc )
02398         std::swap(maxLoc->x, maxLoc->y);
02399 }
02400 
02401 /****************************************************************************************\
02402 *                                         norm                                           *
02403 \****************************************************************************************/
02404 
02405 namespace cv
02406 {
02407 
02408 template<typename T, typename ST> int
02409 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
02410 {
02411     ST result = *_result;
02412     if( !mask )
02413     {
02414         result = std::max(result, normInf<T, ST>(src, len*cn));
02415     }
02416     else
02417     {
02418         for( int i = 0; i < len; i++, src += cn )
02419             if( mask[i] )
02420             {
02421                 for( int k = 0; k < cn; k++ )
02422                     result = std::max(result, ST(cv_abs(src[k])));
02423             }
02424     }
02425     *_result = result;
02426     return 0;
02427 }
02428 
02429 template<typename T, typename ST> int
02430 normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
02431 {
02432     ST result = *_result;
02433     if( !mask )
02434     {
02435         result += normL1<T, ST>(src, len*cn);
02436     }
02437     else
02438     {
02439         for( int i = 0; i < len; i++, src += cn )
02440             if( mask[i] )
02441             {
02442                 for( int k = 0; k < cn; k++ )
02443                     result += cv_abs(src[k]);
02444             }
02445     }
02446     *_result = result;
02447     return 0;
02448 }
02449 
02450 template<typename T, typename ST> int
02451 normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
02452 {
02453     ST result = *_result;
02454     if( !mask )
02455     {
02456         result += normL2Sqr<T, ST>(src, len*cn);
02457     }
02458     else
02459     {
02460         for( int i = 0; i < len; i++, src += cn )
02461             if( mask[i] )
02462             {
02463                 for( int k = 0; k < cn; k++ )
02464                 {
02465                     T v = src[k];
02466                     result += (ST)v*v;
02467                 }
02468             }
02469     }
02470     *_result = result;
02471     return 0;
02472 }
02473 
02474 template<typename T, typename ST> int
02475 normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
02476 {
02477     ST result = *_result;
02478     if( !mask )
02479     {
02480         result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
02481     }
02482     else
02483     {
02484         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
02485             if( mask[i] )
02486             {
02487                 for( int k = 0; k < cn; k++ )
02488                     result = std::max(result, (ST)std::abs(src1[k] - src2[k]));
02489             }
02490     }
02491     *_result = result;
02492     return 0;
02493 }
02494 
02495 template<typename T, typename ST> int
02496 normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
02497 {
02498     ST result = *_result;
02499     if( !mask )
02500     {
02501         result += normL1<T, ST>(src1, src2, len*cn);
02502     }
02503     else
02504     {
02505         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
02506             if( mask[i] )
02507             {
02508                 for( int k = 0; k < cn; k++ )
02509                     result += std::abs(src1[k] - src2[k]);
02510             }
02511     }
02512     *_result = result;
02513     return 0;
02514 }
02515 
02516 template<typename T, typename ST> int
02517 normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int len, int cn)
02518 {
02519     ST result = *_result;
02520     if( !mask )
02521     {
02522         result += normL2Sqr<T, ST>(src1, src2, len*cn);
02523     }
02524     else
02525     {
02526         for( int i = 0; i < len; i++, src1 += cn, src2 += cn )
02527             if( mask[i] )
02528             {
02529                 for( int k = 0; k < cn; k++ )
02530                 {
02531                     ST v = src1[k] - src2[k];
02532                     result += v*v;
02533                 }
02534             }
02535     }
02536     *_result = result;
02537     return 0;
02538 }
02539 
02540 Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
02541 {
02542     return cv::hal::normHamming(a, b, size);
02543 }
02544 
// Stamps out non-template wrapper pairs (norm<L>_<suffix> and
// normDiff<L>_<suffix>) around the generic norm kernels above, so each
// depth/norm combination gets a distinct function address for the dispatch
// tables below.
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
    static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
{ return norm##L##_(src, mask, r, len, cn); } \
    static int normDiff##L##_##suffix(const type* src1, const type* src2, \
    const uchar* mask, ntype* r, int len, int cn) \
{ return normDiff##L##_(src1, src2, mask, r, (int)len, cn); }

// Generates all three norm kinds (Inf, L1, L2) for one element type; the
// accumulator type can differ per norm kind (e.g. int sums for small types,
// double for the wide ones — see the instantiations below).
#define CV_DEF_NORM_ALL(suffix, type, inftype, l1type, l2type) \
    CV_DEF_NORM_FUNC(Inf, suffix, type, inftype) \
    CV_DEF_NORM_FUNC(L1, suffix, type, l1type) \
    CV_DEF_NORM_FUNC(L2, suffix, type, l2type)

CV_DEF_NORM_ALL(8u, uchar, int, int, int)
CV_DEF_NORM_ALL(8s, schar, int, int, int)
CV_DEF_NORM_ALL(16u, ushort, int, int, double)
CV_DEF_NORM_ALL(16s, short, int, int, double)
CV_DEF_NORM_ALL(32s, int, int, double, double)
CV_DEF_NORM_ALL(32f, float, float, double, double)
CV_DEF_NORM_ALL(64f, double, double, double, double)


// Type-erased signatures used by the dispatch tables; the uchar* accumulator
// actually points at an int, float or double depending on the entry (the
// kernels cast it back — see cv::norm below).
typedef int (*NormFunc)(const uchar*, const uchar*, uchar*, int, int);
typedef int (*NormDiffFunc)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
02568 
// Returns the single-array norm kernel for the given norm kind and depth.
// normType here is already remapped to a row index 0..2 (Inf, L1, L2) by the
// caller (cv::norm passes normType >> 1). Column order MUST match the depth
// enum order CV_8U..CV_64F; the trailing 0 marks unsupported CV_16F/USRTYPE.
static NormFunc getNormFunc(int normType, int depth)
{
    static NormFunc normTab[3][8] =
    {
        {
            (NormFunc)GET_OPTIMIZED(normInf_8u), (NormFunc)GET_OPTIMIZED(normInf_8s), (NormFunc)GET_OPTIMIZED(normInf_16u), (NormFunc)GET_OPTIMIZED(normInf_16s),
            (NormFunc)GET_OPTIMIZED(normInf_32s), (NormFunc)GET_OPTIMIZED(normInf_32f), (NormFunc)normInf_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL1_8u), (NormFunc)GET_OPTIMIZED(normL1_8s), (NormFunc)GET_OPTIMIZED(normL1_16u), (NormFunc)GET_OPTIMIZED(normL1_16s),
            (NormFunc)GET_OPTIMIZED(normL1_32s), (NormFunc)GET_OPTIMIZED(normL1_32f), (NormFunc)normL1_64f, 0
        },
        {
            (NormFunc)GET_OPTIMIZED(normL2_8u), (NormFunc)GET_OPTIMIZED(normL2_8s), (NormFunc)GET_OPTIMIZED(normL2_16u), (NormFunc)GET_OPTIMIZED(normL2_16s),
            (NormFunc)GET_OPTIMIZED(normL2_32s), (NormFunc)GET_OPTIMIZED(normL2_32f), (NormFunc)normL2_64f, 0
        }
    };

    return normTab[normType][depth];
}
02589 
// Returns the two-array (difference) norm kernel for the given norm kind and
// depth. Same layout rules as getNormFunc: rows are Inf/L1/L2, columns follow
// the depth enum order, 0 entries mean the depth is unsupported.
static NormDiffFunc getNormDiffFunc(int normType, int depth)
{
    static NormDiffFunc normDiffTab[3][8] =
    {
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffInf_8u), (NormDiffFunc)normDiffInf_8s,
            (NormDiffFunc)normDiffInf_16u, (NormDiffFunc)normDiffInf_16s,
            (NormDiffFunc)normDiffInf_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffInf_32f),
            (NormDiffFunc)normDiffInf_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL1_8u), (NormDiffFunc)normDiffL1_8s,
            (NormDiffFunc)normDiffL1_16u, (NormDiffFunc)normDiffL1_16s,
            (NormDiffFunc)normDiffL1_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL1_32f),
            (NormDiffFunc)normDiffL1_64f, 0
        },
        {
            (NormDiffFunc)GET_OPTIMIZED(normDiffL2_8u), (NormDiffFunc)normDiffL2_8s,
            (NormDiffFunc)normDiffL2_16u, (NormDiffFunc)normDiffL2_16s,
            (NormDiffFunc)normDiffL2_32s, (NormDiffFunc)GET_OPTIMIZED(normDiffL2_32f),
            (NormDiffFunc)normDiffL2_64f, 0
        }
    };

    return normDiffTab[normType][depth];
}
02616 
#ifdef HAVE_OPENCL

// OpenCL implementation of cv::norm for a single array.
// Returns false when the device/type combination is unsupported so the caller
// falls back to the CPU path; on success writes the norm into `result`.
static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
{
    const ocl::Device & d = ocl::Device::getDefault();

#ifdef ANDROID
    // NVidia OpenCL on Android is known to misbehave here — bail out to CPU.
    if (d.isNVidia())
        return false;
#endif

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    bool doubleSupport = d.doubleFPConfig() > 0,
            haveMask = _mask.kind() != _InputArray::NONE;

    // Only the four plain norms are handled, and CV_64F needs device fp64.
    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
         (!doubleSupport && depth == CV_64F))
        return false;

    UMat src = _src.getUMat();

    if (normType == NORM_INF)
    {
        // NORM_INF == max(|x|): reuse the OpenCL min/max reduction.
        if (!ocl_minMaxIdx(_src, NULL, &result, NULL, NULL, _mask,
                           std::max(depth, CV_32S), depth != CV_8U && depth != CV_16U))
            return false;
    }
    else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
    {
        Scalar sc;
        // Unsigned types need no abs() for L1, so the plain-sum kernel suffices.
        bool unstype = depth == CV_8U || depth == CV_16U;

        // Without a mask, reshape to one channel so a single reduction covers
        // all channels at once.
        if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
            return false;

        if (!haveMask)
            cn = 1;

        // Fold the per-channel partial sums together.
        double s = 0.0;
        for (int i = 0; i < cn; ++i)
            s += sc[i];

        // L2 takes the final square root; L1 and L2SQR are the sum itself.
        result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
    }

    return true;
}

#endif
02667 
#ifdef HAVE_IPP
// IPP implementation of cv::norm for a single array.
// Returns false when no suitable IPP primitive exists (or the call fails) so
// the caller falls back to the generic path; on success writes into `result`.
// The long conditional chains below select one IPP function pointer per
// (normType, type) combination; a 0 entry means "unsupported, fall back".
static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
{
#if IPP_VERSION_X100 >= 700
    int cn = src.channels();
    size_t total_size = src.total();
    // IPP works on 2D ROIs: flatten continuous nd-data to rows x cols.
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;

    if( (src.dims == 2 || (src.isContinuous() && mask.isContinuous()))
        && cols > 0 && (size_t)rows*cols == total_size
        && (normType == NORM_INF || normType == NORM_L1 ||
            normType == NORM_L2 || normType == NORM_L2SQR) )
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            // Masked, single-channel variants.
            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
            ippiMaskNormFuncC1 ippFuncC1 =
                normType == NORM_INF ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8u_C1MR :
#if IPP_VERSION_X100 < 900
                type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_8s_C1MR :
#endif
//                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_Inf_32f_C1MR :
                0) :
            normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8u_C1MR :
#if IPP_VERSION_X100 < 900
                type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_8s_C1MR :
#endif
                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L1_32f_C1MR :
                0) :
            normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8u_C1MR :
#if IPP_VERSION_X100 < 900
                type == CV_8SC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_8s_C1MR :
#endif
                type == CV_16UC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormFuncC1)ippiNorm_L2_32f_C1MR :
                0) : 0;
            if( ippFuncC1 )
            {
                Ipp64f norm;
                if( ippFuncC1(src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
                {
                    // IPP computes plain L2; square it for NORM_L2SQR.
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
                }
            }
#if IPP_DISABLE_BLOCK
            // Masked 3-channel variants — currently disabled upstream.
            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskNormFuncC3 ippFuncC3 =
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
                type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8s_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_32f_C3CMR :
                0) :
            normType == NORM_L1 ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8u_C3CMR :
                type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_8s_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L1_32f_C3CMR :
                0) :
            normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8u_C3CMR :
                type == CV_8SC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_8s_C3CMR :
                type == CV_16UC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_16u_C3CMR :
                type == CV_32FC3 ? (ippiMaskNormFuncC3)ippiNorm_L2_32f_C3CMR :
                0) : 0;
            if( ippFuncC3 )
            {
                // Per-channel calls, then combine according to the norm type.
                Ipp64f norm1, norm2, norm3;
                if( ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
                    ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
                    ippFuncC3(src.data, (int)src.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
                {
                    Ipp64f norm =
                        normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
                        normType == NORM_L1 ? norm1 + norm2 + norm3 :
                        normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
                        0;
                    result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
                    return true;
                }
            }
#endif
        }
        else
        {
            // Unmasked variants; some IPP entry points take an algorithm hint.
            typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
            typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
            ippiNormFuncHint ippFuncHint =
                normType == NORM_L1 ?
                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
                type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
                type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
                type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
                type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
                0) : 0;
            ippiNormFuncNoHint ippFuncNoHint =
                normType == NORM_INF ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
#if (IPP_VERSION_X100 >= 810)
                type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
                type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
#endif
                type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
                type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
                type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
                type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
                type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
                0) :
                normType == NORM_L2 || normType == NORM_L2SQR ?
                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
                type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
                type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
                0) : 0;
            // Make sure only zero or one version of the function pointer is valid
            CV_Assert(!ippFuncHint || !ippFuncNoHint);
            if( ippFuncHint || ippFuncNoHint )
            {
                // IPP returns one value per channel; combine them per norm type.
                Ipp64f norm_array[4];
                IppStatus ret = ippFuncHint ? ippFuncHint(src.ptr(), (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
                                ippFuncNoHint(src.ptr(), (int)src.step[0], sz, norm_array);
                if( ret >= 0 )
                {
                    Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
                    for( int i = 1; i < cn; i++ )
                    {
                        norm =
                            normType == NORM_INF ? std::max(norm, norm_array[i]) :
                            normType == NORM_L1 ? norm + norm_array[i] :
                            normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
                            0;
                    }
                    // L2 accumulated squared sums above, take the root here.
                    result = (normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm);
                    return true;
                }
            }
        }
    }
#else
    CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
#endif
    return false;
}
#endif
02843 }
02844 
// Computes the absolute norm of an array: NORM_INF, NORM_L1, NORM_L2,
// NORM_L2SQR, or (for CV_8U data) NORM_HAMMING / NORM_HAMMING2.
// Dispatch order: OpenCL, then IPP, then fast continuous-data shortcuts,
// then the generic per-plane kernel loop.
double cv::norm( InputArray _src, int normType, InputArray _mask )
{
    // Strip modifier bits (e.g. NORM_RELATIVE) — only the base kind matters here.
    normType &= NORM_TYPE_MASK;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );

#if defined HAVE_OPENCL || defined HAVE_IPP
    double _result = 0;
#endif

#ifdef HAVE_OPENCL
    // OpenCL path; CV_OCL_RUN_ returns _result from this function on success.
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_norm(_src, normType, _mask, _result),
                _result)
#endif

    Mat src = _src.getMat(), mask = _mask.getMat();
    // IPP path; also returns early on success.
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(src, normType, mask, _result), _result);

    int depth = src.depth(), cn = src.channels();
    // Fast path: continuous unmasked data small enough to index with int
    // can be processed in a single kernel call.
    if( src.isContinuous() && mask.empty() )
    {
        size_t len = src.total()*cn;
        if( len == (size_t)(int)len )
        {
            if( depth == CV_32F )
            {
                const float* data = src.ptr<float>();

                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL2_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normL1_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normInf_32f)(data, 0, &result, (int)len, 1);
                    return result;
                }
            }
            if( depth == CV_8U )
            {
                const uchar* data = src.ptr<uchar>();

                if( normType == NORM_HAMMING )
                {
                    return hal::normHamming(data, (int)len);
                }

                if( normType == NORM_HAMMING2 )
                {
                    // cell size 2: differing bit-pairs are counted once.
                    return hal::normHamming(data, (int)len, 2);
                }
            }
        }
    }

    CV_Assert( mask.empty() || mask.type() == CV_8U );

    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            // Masked Hamming: zero out masked-off bytes, then recurse unmasked.
            Mat temp;
            bitwise_and(src, mask, temp);
            return norm(temp, normType);
        }
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src, 0};
        uchar* ptrs[1];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            result += hal::normHamming(ptrs[0], total, cellSize);
        }

        return result;
    }

    // Generic path: look up the per-depth kernel. normType >> 1 maps
    // NORM_INF/L1/L2(SQR) onto the 0..2 row index used by getNormFunc.
    NormFunc func = getNormFunc(normType >> 1, depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    // The kernel writes its accumulator through an untyped pointer; this union
    // provides int/float/double storage in one slot (matched per depth below).
    union
    {
        double d;
        int i;
        float f;
    }
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    // For small integer types the kernels accumulate into an int; process in
    // blocks sized so the int partial sum cannot overflow, flushing into the
    // double total between blocks.
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    int isum = 0;
    int *ibuf = &result.i;
    size_t esz = 0;

    if( blockSum )
    {
        intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
            count += bsz;
            // Flush the int partial sum before it can overflow, and at the end.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                result.d += isum;
                isum = 0;
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }

    // Widen the NORM_INF accumulator (int or float depending on depth) to double.
    if( normType == NORM_INF )
    {
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
            result.d = result.f;
        else
            result.d = result.i;
    }
    else if( normType == NORM_L2 )
        result.d = std::sqrt(result.d);

    return result.d;
}
03005 
03006 #ifdef HAVE_OPENCL
03007 
03008 namespace cv {
03009 
// OpenCL implementation of cv::norm for the difference of two arrays,
// optionally relative (NORM_RELATIVE divides by the norm of _src2).
// NOTE: `result` is accumulated into (`result += ...`), so the caller must
// pass it initialized to 0. Returns false to request the CPU fallback.
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{
#ifdef ANDROID
    // NVidia OpenCL on Android is known to misbehave here — bail out to CPU.
    if (ocl::Device::getDefault().isNVidia())
        return false;
#endif

    Scalar sc1, sc2;
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    bool relative = (normType & NORM_RELATIVE) != 0;
    normType &= ~NORM_RELATIVE;
    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;

    if (normsum)
    {
        // Sum-based norms: one reduction yields per-channel sums of the
        // difference (sc1) and, when relative, of _src2 (sc2).
        if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
                     OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
            return false;
    }
    else
    {
        // NORM_INF: max-abs reduction over the difference; single result, so
        // treat it as one channel below.
        if (!ocl_minMaxIdx(_src1, NULL, &sc1[0], NULL, NULL, _mask, std::max(CV_32S, depth),
                           false, _src2, relative ? &sc2[0] : NULL))
            return false;
        cn = 1;
    }

    // Fold per-channel values; `result` carries the numerator, s2 the denominator.
    double s2 = 0;
    for (int i = 0; i < cn; ++i)
    {
        result += sc1[i];
        if (relative)
            s2 += sc2[i];
    }

    if (normType == NORM_L2)
    {
        result = std::sqrt(result);
        if (relative)
            s2 = std::sqrt(s2);
    }

    // Relative norm: epsilon guards against division by zero.
    if (relative)
        result /= (s2 + DBL_EPSILON);

    return true;
}
03057 
03058 }
03059 
03060 #endif
03061 
03062 #ifdef HAVE_IPP
03063 namespace cv
03064 {
03065 static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
03066 {
03067 #if IPP_VERSION_X100 >= 700
03068     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
03069 
03070     if( normType & CV_RELATIVE )
03071     {
03072         normType &= NORM_TYPE_MASK;
03073         CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
03074                 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
03075         size_t total_size = src1.total();
03076         int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
03077         if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
03078             && cols > 0 && (size_t)rows*cols == total_size
03079             && (normType == NORM_INF || normType == NORM_L1 ||
03080                 normType == NORM_L2 || normType == NORM_L2SQR) )
03081         {
03082             IppiSize sz = { cols, rows };
03083             int type = src1.type();
03084             if( !mask.empty() )
03085             {
03086                 typedef IppStatus (CV_STDCALL* ippiMaskNormRelFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
03087                 ippiMaskNormRelFuncC1 ippFuncC1 =
03088                     normType == NORM_INF ?
03089                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8u_C1MR :
03090 #if IPP_VERSION_X100 < 900
03091 #ifndef __APPLE__
03092                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_8s_C1MR :
03093 #endif
03094 #endif
03095                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_16u_C1MR :
03096                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_Inf_32f_C1MR :
03097                     0) :
03098                     normType == NORM_L1 ?
03099                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8u_C1MR :
03100 #if IPP_VERSION_X100 < 900
03101 #ifndef __APPLE__
03102                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_8s_C1MR :
03103 #endif
03104 #endif
03105                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_16u_C1MR :
03106                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L1_32f_C1MR :
03107                     0) :
03108                     normType == NORM_L2 || normType == NORM_L2SQR ?
03109                     (type == CV_8UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8u_C1MR :
03110 #if IPP_VERSION_X100 < 900
03111                     type == CV_8SC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_8s_C1MR :
03112 #endif
03113                     type == CV_16UC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_16u_C1MR :
03114                     type == CV_32FC1 ? (ippiMaskNormRelFuncC1)ippiNormRel_L2_32f_C1MR :
03115                     0) : 0;
03116                 if( ippFuncC1 )
03117                 {
03118                     Ipp64f norm;
03119                     if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
03120                     {
03121                         result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
03122                         return true;
03123                     }
03124                 }
03125             }
03126             else
03127             {
03128                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
03129                 typedef IppStatus (CV_STDCALL* ippiNormRelFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
03130                 ippiNormRelFuncNoHint ippFuncNoHint =
03131                     normType == NORM_INF ?
03132                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_8u_C1R :
03133                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16u_C1R :
03134                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_16s_C1R :
03135                     type == CV_32FC1 ? (ippiNormRelFuncNoHint)ippiNormRel_Inf_32f_C1R :
03136                     0) :
03137                     normType == NORM_L1 ?
03138                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_8u_C1R :
03139                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16u_C1R :
03140                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L1_16s_C1R :
03141                     0) :
03142                     normType == NORM_L2 || normType == NORM_L2SQR ?
03143                     (type == CV_8UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_8u_C1R :
03144                     type == CV_16UC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16u_C1R :
03145                     type == CV_16SC1 ? (ippiNormRelFuncNoHint)ippiNormRel_L2_16s_C1R :
03146                     0) : 0;
03147                 ippiNormRelFuncHint ippFuncHint =
03148                     normType == NORM_L1 ?
03149                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L1_32f_C1R :
03150                     0) :
03151                     normType == NORM_L2 || normType == NORM_L2SQR ?
03152                     (type == CV_32FC1 ? (ippiNormRelFuncHint)ippiNormRel_L2_32f_C1R :
03153                     0) : 0;
03154                 if (ippFuncNoHint)
03155                 {
03156                     Ipp64f norm;
03157                     if( ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm) >= 0 )
03158                     {
03159                         result = (double)norm;
03160                         return true;
03161                     }
03162                 }
03163                 if (ippFuncHint)
03164                 {
03165                     Ipp64f norm;
03166                     if( ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, &norm, ippAlgHintAccurate) >= 0 )
03167                     {
03168                         result = (double)norm;
03169                         return true;
03170                     }
03171                 }
03172             }
03173         }
03174         return false;
03175     }
03176 
03177     normType &= 7;
03178     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
03179                normType == NORM_L2 || normType == NORM_L2SQR ||
03180               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
03181 
03182     size_t total_size = src1.total();
03183     int rows = src1.size[0], cols = rows ? (int)(total_size/rows) : 0;
03184     if( (src1.dims == 2 || (src1.isContinuous() && src2.isContinuous() && mask.isContinuous()))
03185         && cols > 0 && (size_t)rows*cols == total_size
03186         && (normType == NORM_INF || normType == NORM_L1 ||
03187             normType == NORM_L2 || normType == NORM_L2SQR) )
03188     {
03189         IppiSize sz = { cols, rows };
03190         int type = src1.type();
03191         if( !mask.empty() )
03192         {
03193             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC1)(const void *, int, const void *, int, const void *, int, IppiSize, Ipp64f *);
03194             ippiMaskNormDiffFuncC1 ippFuncC1 =
03195                 normType == NORM_INF ?
03196                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8u_C1MR :
03197 #if IPP_VERSION_X100 < 900
03198                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_8s_C1MR :
03199 #endif
03200                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_16u_C1MR :
03201                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_Inf_32f_C1MR :
03202                 0) :
03203                 normType == NORM_L1 ?
03204                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
03205 #if IPP_VERSION_X100 < 900
03206 #ifndef __APPLE__
03207                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
03208 #endif
03209 #endif
03210                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
03211                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
03212                 0) :
03213                 normType == NORM_L2 || normType == NORM_L2SQR ?
03214                 (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8u_C1MR :
03215 #if IPP_VERSION_X100 < 900
03216                 type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_8s_C1MR :
03217 #endif
03218                 type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_16u_C1MR :
03219                 type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L2_32f_C1MR :
03220                 0) : 0;
03221             if( ippFuncC1 )
03222             {
03223                 Ipp64f norm;
03224                 if( ippFuncC1(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], mask.ptr(), (int)mask.step[0], sz, &norm) >= 0 )
03225                 {
03226                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
03227                     return true;
03228                 }
03229             }
03230 #ifndef __APPLE__
03231             typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
03232             ippiMaskNormDiffFuncC3 ippFuncC3 =
03233                 normType == NORM_INF ?
03234                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
03235 #if IPP_VERSION_X100 < 900
03236                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8s_C3CMR :
03237 #endif
03238                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_16u_C3CMR :
03239                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_32f_C3CMR :
03240                 0) :
03241                 normType == NORM_L1 ?
03242                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8u_C3CMR :
03243 #if IPP_VERSION_X100 < 900
03244                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_8s_C3CMR :
03245 #endif
03246                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_16u_C3CMR :
03247                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L1_32f_C3CMR :
03248                 0) :
03249                 normType == NORM_L2 || normType == NORM_L2SQR ?
03250                 (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8u_C3CMR :
03251 #if IPP_VERSION_X100 < 900
03252                 type == CV_8SC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_8s_C3CMR :
03253 #endif
03254                 type == CV_16UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_16u_C3CMR :
03255                 type == CV_32FC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_L2_32f_C3CMR :
03256                 0) : 0;
03257             if( ippFuncC3 )
03258             {
03259                 Ipp64f norm1, norm2, norm3;
03260                 if( ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 1, &norm1) >= 0 &&
03261                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 2, &norm2) >= 0 &&
03262                     ippFuncC3(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], mask.data, (int)mask.step[0], sz, 3, &norm3) >= 0)
03263                 {
03264                     Ipp64f norm =
03265                         normType == NORM_INF ? std::max(std::max(norm1, norm2), norm3) :
03266                         normType == NORM_L1 ? norm1 + norm2 + norm3 :
03267                         normType == NORM_L2 || normType == NORM_L2SQR ? std::sqrt(norm1 * norm1 + norm2 * norm2 + norm3 * norm3) :
03268                         0;
03269                     result = (normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm);
03270                     return true;
03271                 }
03272             }
03273 #endif
03274         }
03275         else
03276         {
03277             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
03278             typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
03279             ippiNormDiffFuncHint ippFuncHint =
03280                 normType == NORM_L1 ?
03281                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
03282                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
03283                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
03284                 0) :
03285                 normType == NORM_L2 || normType == NORM_L2SQR ?
03286                 (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
03287                 type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
03288                 type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
03289                 0) : 0;
03290             ippiNormDiffFuncNoHint ippFuncNoHint =
03291                 normType == NORM_INF ?
03292                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
03293                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
03294                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
03295                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
03296                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
03297                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
03298                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
03299 #if (IPP_VERSION_X100 >= 810)
03300                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
03301                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
03302 #endif
03303                 type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
03304                 type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
03305                 type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
03306                 0) :
03307                 normType == NORM_L1 ?
03308                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
03309                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
03310                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
03311                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
03312                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
03313                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
03314 #if !(IPP_VERSION_X100 == 820 || IPP_VERSION_X100 == 821) // Oct 2014: Accuracy issue with IPP 8.2 / 8.2.1
03315                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
03316 #endif
03317                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
03318                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
03319                 0) :
03320                 normType == NORM_L2 || normType == NORM_L2SQR ?
03321                 (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
03322                 type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
03323                 type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
03324                 type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
03325                 type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
03326                 type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
03327                 type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
03328                 type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
03329                 type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
03330                 0) : 0;
03331             // Make sure only zero or one version of the function pointer is valid
03332             CV_Assert(!ippFuncHint || !ippFuncNoHint);
03333             if( ippFuncHint || ippFuncNoHint )
03334             {
03335                 Ipp64f norm_array[4];
03336                 IppStatus ret = ippFuncHint ? ippFuncHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
03337                                 ippFuncNoHint(src1.ptr(), (int)src1.step[0], src2.ptr(), (int)src2.step[0], sz, norm_array);
03338                 if( ret >= 0 )
03339                 {
03340                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
03341                     for( int i = 1; i < src1.channels(); i++ )
03342                     {
03343                         norm =
03344                             normType == NORM_INF ? std::max(norm, norm_array[i]) :
03345                             normType == NORM_L1 ? norm + norm_array[i] :
03346                             normType == NORM_L2 || normType == NORM_L2SQR ? norm + norm_array[i] * norm_array[i] :
03347                             0;
03348                     }
03349                     result = (normType == NORM_L2 ? (double)std::sqrt(norm) : (double)norm);
03350                     return true;
03351                 }
03352             }
03353         }
03354     }
03355 #else
03356     CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
03357 #endif
03358     return false;
03359 }
03360 }
03361 #endif
03362 
03363 
// Computes a norm of the (element-wise) difference between two arrays.
// Supported norms: NORM_INF, NORM_L1, NORM_L2, NORM_L2SQR, and (for CV_8U
// inputs) NORM_HAMMING / NORM_HAMMING2; CV_RELATIVE may be OR-ed in to get
// ||src1-src2|| / ||src2||. Both inputs must have the same size and type.
// The optional mask (CV_8U) selects which elements participate.
double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
{
    CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );

#if defined HAVE_OPENCL || defined HAVE_IPP
    double _result = 0;
#endif

#ifdef HAVE_OPENCL
    // Try the OpenCL path first when the inputs are UMats; on success the
    // macro returns _result directly.
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
                ocl_norm(_src1, _src2, normType, _mask, _result),
                _result)
#endif

    // IPP fast path; falls through to the generic code when it returns false.
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_norm(_src1, _src2, normType, _mask, _result), _result);

    if( normType & CV_RELATIVE )
    {
        // Relative norm: ||src1 - src2|| / ||src2||. DBL_EPSILON guards
        // against division by zero when src2 is all zeros.
        return norm(_src1, _src2, normType & ~CV_RELATIVE, _mask)/(norm(_src2, normType, _mask) + DBL_EPSILON);
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int depth = src1.depth(), cn = src1.channels();

    // Strip modifier bits (CV_RELATIVE etc.), keeping only the base norm id.
    normType &= 7;
    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
               normType == NORM_L2 || normType == NORM_L2SQR ||
              ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );

    // Fast path: contiguous unmasked 32F data can be processed as one flat
    // buffer by the optimized normDiff* kernels.
    if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
    {
        size_t len = src1.total()*src1.channels();
        // The optimized kernels take an int length; skip if it would overflow.
        if( len == (size_t)(int)len )
        {
            if( src1.depth() == CV_32F )
            {
                const float* data1 = src1.ptr<float>();
                const float* data2 = src2.ptr<float>();

                if( normType == NORM_L2 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return std::sqrt(result);
                }
                if( normType == NORM_L2SQR )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL2_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_L1 )
                {
                    double result = 0;
                    GET_OPTIMIZED(normDiffL1_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
                if( normType == NORM_INF )
                {
                    float result = 0;
                    GET_OPTIMIZED(normDiffInf_32f)(data1, data2, 0, &result, (int)len, 1);
                    return result;
                }
            }
        }
    }

    CV_Assert( mask.empty() || mask.type() == CV_8U );

    if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
    {
        if( !mask.empty() )
        {
            // Masked Hamming: zero out masked-off bytes of the XOR diff and
            // recurse into the single-array Hamming norm.
            Mat temp;
            bitwise_xor(src1, src2, temp);
            bitwise_and(temp, mask, temp);
            return norm(temp, normType);
        }
        // NORM_HAMMING counts differing bits; NORM_HAMMING2 counts 2-bit cells.
        int cellSize = normType == NORM_HAMMING ? 1 : 2;

        const Mat* arrays[] = {&src1, &src2, 0};
        uchar* ptrs[2];
        NAryMatIterator it(arrays, ptrs);
        int total = (int)it.size;
        int result = 0;

        // Iterate plane-by-plane so non-continuous matrices work too.
        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
        }

        return result;
    }

    // Generic path: look up the per-type difference-norm kernel.
    // normType >> 1 maps NORM_INF/L1/L2/L2SQR onto the function-table index.
    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src1, &src2, &mask, 0};
    uchar* ptrs[3];
    // The kernel writes its partial result through a uchar* in whichever of
    // these representations matches the accumulation type; the union lets us
    // read it back without casts.
    union
    {
        double d;
        float f;
        int i;
        unsigned u;
    }
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
    // For small integer depths the kernels accumulate in 32-bit unsigned ints,
    // which must be flushed into the double before they can overflow.
    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
    unsigned isum = 0;
    unsigned *ibuf = &result.u;
    size_t esz = 0;

    if( blockSum )
    {
        // Block sizes chosen so the unsigned partial sum cannot overflow:
        // larger blocks for 8-bit L1, smaller for the other combinations.
        intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        ibuf = &isum;
        esz = src1.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
            count += bsz;
            // Flush the integer partial sum when the next block could
            // overflow it, or when this was the last block overall.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                result.d += isum;
                isum = 0;
                count = 0;
            }
            // esz is 0 unless blockSum is on: in the non-blocked case each
            // kernel call consumes the whole plane, so no advancing is needed.
            ptrs[0] += bsz*esz;
            ptrs[1] += bsz*esz;
            if( ptrs[2] )
                ptrs[2] += bsz;
        }
    }

    if( normType == NORM_INF )
    {
        // NORM_INF kernels store the max in the accumulation type itself;
        // convert it to double through the matching union member.
        if( depth == CV_64F )
            ;
        else if( depth == CV_32F )
            result.d = result.f;
        else
            result.d = result.u;
    }
    else if( normType == NORM_L2 )
        result.d = std::sqrt(result.d);

    return result.d;
}
03522 
03523 
03524 ///////////////////////////////////// batch distance ///////////////////////////////////////
03525 
03526 namespace cv
03527 {
03528 
03529 template<typename _Tp, typename _Rt>
03530 void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2,
03531                   int nvecs, int len, _Rt* dist, const uchar* mask)
03532 {
03533     step2 /= sizeof(src2[0]);
03534     if( !mask )
03535     {
03536         for( int i = 0; i < nvecs; i++ )
03537             dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len);
03538     }
03539     else
03540     {
03541         _Rt val0 = std::numeric_limits<_Rt>::max();
03542         for( int i = 0; i < nvecs; i++ )
03543             dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
03544     }
03545 }
03546 
03547 template<typename _Tp, typename _Rt>
03548 void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2,
03549                      int nvecs, int len, _Rt* dist, const uchar* mask)
03550 {
03551     step2 /= sizeof(src2[0]);
03552     if( !mask )
03553     {
03554         for( int i = 0; i < nvecs; i++ )
03555             dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len);
03556     }
03557     else
03558     {
03559         _Rt val0 = std::numeric_limits<_Rt>::max();
03560         for( int i = 0; i < nvecs; i++ )
03561             dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0;
03562     }
03563 }
03564 
03565 template<typename _Tp, typename _Rt>
03566 void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2,
03567                   int nvecs, int len, _Rt* dist, const uchar* mask)
03568 {
03569     step2 /= sizeof(src2[0]);
03570     if( !mask )
03571     {
03572         for( int i = 0; i < nvecs; i++ )
03573             dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len));
03574     }
03575     else
03576     {
03577         _Rt val0 = std::numeric_limits<_Rt>::max();
03578         for( int i = 0; i < nvecs; i++ )
03579             dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0;
03580     }
03581 }
03582 
03583 static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
03584                              int nvecs, int len, int* dist, const uchar* mask)
03585 {
03586     step2 /= sizeof(src2[0]);
03587     if( !mask )
03588     {
03589         for( int i = 0; i < nvecs; i++ )
03590              dist[i] = hal::normHamming(src1, src2 + step2*i, len);
03591     }
03592     else
03593     {
03594         int val0 = INT_MAX;
03595         for( int i = 0; i < nvecs; i++ )
03596         {
03597             if (mask[i])
03598                 dist[i] = hal::normHamming(src1, src2 + step2*i, len);
03599             else
03600                 dist[i] = val0;
03601         }
03602     }
03603 }
03604 
03605 static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
03606                               int nvecs, int len, int* dist, const uchar* mask)
03607 {
03608     step2 /= sizeof(src2[0]);
03609     if( !mask )
03610     {
03611         for( int i = 0; i < nvecs; i++ )
03612             dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
03613     }
03614     else
03615     {
03616         int val0 = INT_MAX;
03617         for( int i = 0; i < nvecs; i++ )
03618         {
03619             if (mask[i])
03620                 dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
03621             else
03622                 dist[i] = val0;
03623         }
03624     }
03625 }
03626 
03627 static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
03628                                int nvecs, int len, int* dist, const uchar* mask)
03629 {
03630     batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
03631 }
03632 
03633 static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
03634                                int nvecs, int len, float* dist, const uchar* mask)
03635 {
03636     batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
03637 }
03638 
03639 static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
03640                                   int nvecs, int len, int* dist, const uchar* mask)
03641 {
03642     batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
03643 }
03644 
03645 static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
03646                                   int nvecs, int len, float* dist, const uchar* mask)
03647 {
03648     batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
03649 }
03650 
03651 static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
03652                                int nvecs, int len, float* dist, const uchar* mask)
03653 {
03654     batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
03655 }
03656 
03657 static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
03658                              int nvecs, int len, float* dist, const uchar* mask)
03659 {
03660     batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
03661 }
03662 
03663 static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
03664                                 int nvecs, int len, float* dist, const uchar* mask)
03665 {
03666     batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
03667 }
03668 
03669 static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
03670                              int nvecs, int len, float* dist, const uchar* mask)
03671 {
03672     batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
03673 }
03674 
03675 typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
03676                               int nvecs, int len, uchar* dist, const uchar* mask);
03677 
03678 
03679 struct BatchDistInvoker : public ParallelLoopBody
03680 {
03681     BatchDistInvoker( const Mat& _src1, const Mat& _src2,
03682                       Mat& _dist, Mat& _nidx, int _K,
03683                       const Mat& _mask, int _update,
03684                       BatchDistFunc _func)
03685     {
03686         src1 = &_src1;
03687         src2 = &_src2;
03688         dist = &_dist;
03689         nidx = &_nidx;
03690         K = _K;
03691         mask = &_mask;
03692         update = _update;
03693         func = _func;
03694     }
03695 
03696     void operator()(const Range& range) const
03697     {
03698         AutoBuffer<int> buf(src2->rows);
03699         int* bufptr = buf;
03700 
03701         for( int i = range.start; i < range.end; i++ )
03702         {
03703             func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
03704                  K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);
03705 
03706             if( K > 0 )
03707             {
03708                 int* nidxptr = nidx->ptr<int>(i);
03709                 // since positive float's can be compared just like int's,
03710                 // we handle both CV_32S and CV_32F cases with a single branch
03711                 int* distptr = (int*)dist->ptr(i);
03712 
03713                 int j, k;
03714 
03715                 for( j = 0; j < src2->rows; j++ )
03716                 {
03717                     int d = bufptr[j];
03718                     if( d < distptr[K-1] )
03719                     {
03720                         for( k = K-2; k >= 0 && distptr[k] > d; k-- )
03721                         {
03722                             nidxptr[k+1] = nidxptr[k];
03723                             distptr[k+1] = distptr[k];
03724                         }
03725                         nidxptr[k+1] = j + update;
03726                         distptr[k+1] = d;
03727                     }
03728                 }
03729             }
03730         }
03731     }
03732 
03733     const Mat *src1;
03734     const Mat *src2;
03735     Mat *dist;
03736     Mat *nidx;
03737     const Mat *mask;
03738     int K;
03739     int update;
03740     BatchDistFunc func;
03741 };
03742 
03743 }
03744 
03745 void cv::batchDistance( InputArray _src1, InputArray _src2,
03746                         OutputArray _dist, int dtype, OutputArray _nidx,
03747                         int normType, int K, InputArray _mask,
03748                         int update, bool crosscheck )
03749 {
03750     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
03751     int type = src1.type();
03752     CV_Assert( type == src2.type() && src1.cols == src2.cols &&
03753                (type == CV_32F || type == CV_8U));
03754     CV_Assert( _nidx.needed() == (K > 0) );
03755 
03756     if( dtype == -1 )
03757     {
03758         dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
03759     }
03760     CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);
03761 
03762     K = std::min(K, src2.rows);
03763 
03764     _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
03765     Mat dist = _dist.getMat(), nidx;
03766     if( _nidx.needed() )
03767     {
03768         _nidx.create(dist.size(), CV_32S);
03769         nidx = _nidx.getMat();
03770     }
03771 
03772     if( update == 0 && K > 0 )
03773     {
03774         dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
03775         nidx = Scalar::all(-1);
03776     }
03777 
03778     if( crosscheck )
03779     {
03780         CV_Assert( K == 1 && update == 0 && mask.empty() );
03781         Mat tdist, tidx;
03782         batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);
03783 
03784         // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
03785         // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
03786         // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
03787         // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
03788         // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
03789         // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
03790         if( dtype == CV_32S )
03791         {
03792             for( int i = 0; i < tdist.rows; i++ )
03793             {
03794                 int idx = tidx.at<int>(i);
03795                 int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
03796                 if( d < d0 )
03797                 {
03798                     dist.at<int>(idx) = d;
03799                     nidx.at<int>(idx) = i + update;
03800                 }
03801             }
03802         }
03803         else
03804         {
03805             for( int i = 0; i < tdist.rows; i++ )
03806             {
03807                 int idx = tidx.at<int>(i);
03808                 float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
03809                 if( d < d0 )
03810                 {
03811                     dist.at<float>(idx) = d;
03812                     nidx.at<int>(idx) = i + update;
03813                 }
03814             }
03815         }
03816         return;
03817     }
03818 
03819     BatchDistFunc func = 0;
03820     if( type == CV_8U )
03821     {
03822         if( normType == NORM_L1 && dtype == CV_32S )
03823             func = (BatchDistFunc)batchDistL1_8u32s;
03824         else if( normType == NORM_L1 && dtype == CV_32F )
03825             func = (BatchDistFunc)batchDistL1_8u32f;
03826         else if( normType == NORM_L2SQR && dtype == CV_32S )
03827             func = (BatchDistFunc)batchDistL2Sqr_8u32s;
03828         else if( normType == NORM_L2SQR && dtype == CV_32F )
03829             func = (BatchDistFunc)batchDistL2Sqr_8u32f;
03830         else if( normType == NORM_L2 && dtype == CV_32F )
03831             func = (BatchDistFunc)batchDistL2_8u32f;
03832         else if( normType == NORM_HAMMING && dtype == CV_32S )
03833             func = (BatchDistFunc)batchDistHamming;
03834         else if( normType == NORM_HAMMING2 && dtype == CV_32S )
03835             func = (BatchDistFunc)batchDistHamming2;
03836     }
03837     else if( type == CV_32F && dtype == CV_32F )
03838     {
03839         if( normType == NORM_L1 )
03840             func = (BatchDistFunc)batchDistL1_32f;
03841         else if( normType == NORM_L2SQR )
03842             func = (BatchDistFunc)batchDistL2Sqr_32f;
03843         else if( normType == NORM_L2 )
03844             func = (BatchDistFunc)batchDistL2_32f;
03845     }
03846 
03847     if( func == 0 )
03848         CV_Error_(CV_StsUnsupportedFormat,
03849                   ("The combination of type=%d, dtype=%d and normType=%d is not supported",
03850                    type, dtype, normType));
03851 
03852     parallel_for_(Range(0, src1.rows),
03853                   BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
03854 }
03855 
03856 
03857 void cv::findNonZero( InputArray _src, OutputArray _idx )
03858 {
03859     Mat src = _src.getMat();
03860     CV_Assert( src.type() == CV_8UC1 );
03861     int n = countNonZero(src);
03862     if( n == 0 )
03863     {
03864         _idx.release();
03865         return;
03866     }
03867     if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
03868         _idx.release();
03869     _idx.create(n, 1, CV_32SC2);
03870     Mat idx = _idx.getMat();
03871     CV_Assert(idx.isContinuous());
03872     Point* idx_ptr = idx.ptr<Point>();
03873 
03874     for( int i = 0; i < src.rows; i++ )
03875     {
03876         const uchar* bin_ptr = src.ptr(i);
03877         for( int j = 0; j < src.cols; j++ )
03878             if( bin_ptr[j] )
03879                 *idx_ptr++ = Point(j, i);
03880     }
03881 }
03882 
03883 double cv::PSNR(InputArray _src1, InputArray _src2)
03884 {
03885     CV_Assert( _src1.depth() == CV_8U );
03886     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
03887     return 20*log10(255./(diff+DBL_EPSILON));
03888 }
03889 
03890 
03891 CV_IMPL CvScalar  cvSum( const CvArr* srcarr )
03892 {
03893     cv::Scalar  sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
03894     if( CV_IS_IMAGE(srcarr) )
03895     {
03896         int coi = cvGetImageCOI((IplImage*)srcarr);
03897         if( coi )
03898         {
03899             CV_Assert( 0 < coi && coi <= 4 );
03900             sum = cv::Scalar (sum[coi-1]);
03901         }
03902     }
03903     return sum;
03904 }
03905 
03906 CV_IMPL int cvCountNonZero( const CvArr* imgarr )
03907 {
03908     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
03909     if( img.channels() > 1 )
03910         cv::extractImageCOI(imgarr, img);
03911     return countNonZero(img);
03912 }
03913 
03914 
03915 CV_IMPL  CvScalar 
03916 cvAvg( const void* imgarr, const void* maskarr )
03917 {
03918     cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
03919     cv::Scalar  mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
03920     if( CV_IS_IMAGE(imgarr) )
03921     {
03922         int coi = cvGetImageCOI((IplImage*)imgarr);
03923         if( coi )
03924         {
03925             CV_Assert( 0 < coi && coi <= 4 );
03926             mean = cv::Scalar (mean[coi-1]);
03927         }
03928     }
03929     return mean;
03930 }
03931 
03932 
03933 CV_IMPL  void
03934 cvAvgSdv( const CvArr* imgarr, CvScalar * _mean, CvScalar * _sdv, const void* maskarr )
03935 {
03936     cv::Scalar  mean, sdv;
03937 
03938     cv::Mat mask;
03939     if( maskarr )
03940         mask = cv::cvarrToMat(maskarr);
03941 
03942     cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );
03943 
03944     if( CV_IS_IMAGE(imgarr) )
03945     {
03946         int coi = cvGetImageCOI((IplImage*)imgarr);
03947         if( coi )
03948         {
03949             CV_Assert( 0 < coi && coi <= 4 );
03950             mean = cv::Scalar (mean[coi-1]);
03951             sdv = cv::Scalar (sdv[coi-1]);
03952         }
03953     }
03954 
03955     if( _mean )
03956         *(cv::Scalar *)_mean = mean;
03957     if( _sdv )
03958         *(cv::Scalar *)_sdv = sdv;
03959 }
03960 
03961 
03962 CV_IMPL void
03963 cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
03964              CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
03965 {
03966     cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
03967     if( maskarr )
03968         mask = cv::cvarrToMat(maskarr);
03969     if( img.channels() > 1 )
03970         cv::extractImageCOI(imgarr, img);
03971 
03972     cv::minMaxLoc( img, _minVal, _maxVal,
03973                    (cv::Point *)_minLoc, (cv::Point *)_maxLoc, mask );
03974 }
03975 
03976 
03977 CV_IMPL  double
03978 cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
03979 {
03980     cv::Mat a, mask;
03981     if( !imgA )
03982     {
03983         imgA = imgB;
03984         imgB = 0;
03985     }
03986 
03987     a = cv::cvarrToMat(imgA, false, true, 1);
03988     if( maskarr )
03989         mask = cv::cvarrToMat(maskarr);
03990 
03991     if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
03992         cv::extractImageCOI(imgA, a);
03993 
03994     if( !imgB )
03995         return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);
03996 
03997     cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
03998     if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
03999         cv::extractImageCOI(imgB, b);
04000 
04001     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
04002 }
04003 
04004 namespace cv { namespace hal {
04005 
// popCountTable[x] == number of set bits (population count) of byte x.
// Lookup table backing the scalar paths of normHamming() for 1-bit cells.
static const uchar popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
04017 
// popCountTable2[x] == number of 2-bit cells of byte x that contain at least
// one set bit. Used by normHamming(..., cellSize == 2), i.e. NORM_HAMMING2.
static const uchar popCountTable2[] =
{
    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};
04029 
// popCountTable4[x] == number of 4-bit cells (nibbles) of byte x that contain
// at least one set bit (0, 1 or 2). Used by normHamming(..., cellSize == 4).
static const uchar popCountTable4[] =
{
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
04041 
04042 int normHamming(const uchar* a, int n)
04043 {
04044     int i = 0;
04045     int result = 0;
04046 #if CV_NEON
04047     {
04048         uint32x4_t bits = vmovq_n_u32(0);
04049         for (; i <= n - 16; i += 16) {
04050             uint8x16_t A_vec = vld1q_u8 (a + i);
04051             uint8x16_t bitsSet = vcntq_u8 (A_vec);
04052             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
04053             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
04054             bits = vaddq_u32(bits, bitSet4);
04055         }
04056         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
04057         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
04058         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
04059     }
04060 #endif
04061         for( ; i <= n - 4; i += 4 )
04062             result += popCountTable[a[i]] + popCountTable[a[i+1]] +
04063             popCountTable[a[i+2]] + popCountTable[a[i+3]];
04064     for( ; i < n; i++ )
04065         result += popCountTable[a[i]];
04066     return result;
04067 }
04068 
04069 int normHamming(const uchar* a, const uchar* b, int n)
04070 {
04071     int i = 0;
04072     int result = 0;
04073 #if CV_NEON
04074     {
04075         uint32x4_t bits = vmovq_n_u32(0);
04076         for (; i <= n - 16; i += 16) {
04077             uint8x16_t A_vec = vld1q_u8 (a + i);
04078             uint8x16_t B_vec = vld1q_u8 (b + i);
04079             uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
04080             uint8x16_t bitsSet = vcntq_u8 (AxorB);
04081             uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
04082             uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
04083             bits = vaddq_u32(bits, bitSet4);
04084         }
04085         uint64x2_t bitSet2 = vpaddlq_u32 (bits);
04086         result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
04087         result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
04088     }
04089 #endif
04090         for( ; i <= n - 4; i += 4 )
04091             result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
04092                     popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
04093     for( ; i < n; i++ )
04094         result += popCountTable[a[i] ^ b[i]];
04095     return result;
04096 }
04097 
04098 int normHamming(const uchar* a, int n, int cellSize)
04099 {
04100     if( cellSize == 1 )
04101         return normHamming(a, n);
04102     const uchar* tab = 0;
04103     if( cellSize == 2 )
04104         tab = popCountTable2;
04105     else if( cellSize == 4 )
04106         tab = popCountTable4;
04107     else
04108         return -1;
04109     int i = 0;
04110     int result = 0;
04111 #if CV_ENABLE_UNROLLED
04112     for( ; i <= n - 4; i += 4 )
04113         result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
04114 #endif
04115     for( ; i < n; i++ )
04116         result += tab[a[i]];
04117     return result;
04118 }
04119 
04120 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
04121 {
04122     if( cellSize == 1 )
04123         return normHamming(a, b, n);
04124     const uchar* tab = 0;
04125     if( cellSize == 2 )
04126         tab = popCountTable2;
04127     else if( cellSize == 4 )
04128         tab = popCountTable4;
04129     else
04130         return -1;
04131     int i = 0;
04132     int result = 0;
04133     #if CV_ENABLE_UNROLLED
04134     for( ; i <= n - 4; i += 4 )
04135         result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
04136                 tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
04137     #endif
04138     for( ; i < n; i++ )
04139         result += tab[a[i] ^ b[i]];
04140     return result;
04141 }
04142 
float normL2Sqr_(const float* a, const float* b, int n)
{
    // Squared Euclidean (L2^2) distance between a[0..n) and b[0..n).
    float d = 0.f;
    int j = 0;
#if CV_SSE
    // Two independent accumulators hide the latency of the add chain.
    float CV_DECL_ALIGNED(16) buf[4];
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    // Scalar path (also the tail of the SSE path), unrolled by four.
    for( ; j <= n - 4; j += 4 )
    {
        const float t0 = a[j] - b[j];
        const float t1 = a[j+1] - b[j+1];
        const float t2 = a[j+2] - b[j+2];
        const float t3 = a[j+3] - b[j+3];
        d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
    }
    for( ; j < n; j++ )
    {
        const float t = a[j] - b[j];
        d += t*t;
    }
    return d;
}
04175 
04176 
float normL1_(const float* a, const float* b, int n)
{
    // L1 (sum of absolute differences) distance between a[0..n) and b[0..n).
    float d = 0.f;
    int j = 0;
#if CV_SSE
    // abs() is implemented by masking off the sign bit of each lane.
    float CV_DECL_ALIGNED(16) buf[4];
    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    __m128 absmask = _mm_load_ps((const float*)absbuf);

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#elif CV_NEON
    // NEON absolute-difference accumulate, four floats per step.
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    for( ; j <= n - 4; j += 4 )
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));

    float CV_DECL_ALIGNED(16) buf[4];
    vst1q_f32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    // Scalar path (also the SIMD tail), unrolled by four.
    for( ; j <= n - 4; j += 4 )
    {
        d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1])
           + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
04216 
04217 int normL1_(const uchar* a, const uchar* b, int n)
04218 {
04219     int j = 0, d = 0;
04220 #if CV_SSE
04221     __m128i d0 = _mm_setzero_si128();
04222 
04223     for( ; j <= n - 16; j += 16 )
04224     {
04225         __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
04226         __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
04227 
04228         d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
04229     }
04230 
04231     for( ; j <= n - 4; j += 4 )
04232     {
04233         __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
04234         __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
04235 
04236         d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
04237     }
04238     d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
04239 #elif CV_NEON
04240     uint32x4_t v_sum = vdupq_n_u32(0.0f);
04241     for ( ; j <= n - 16; j += 16)
04242     {
04243         uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
04244         uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
04245         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
04246         v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
04247     }
04248 
04249     uint CV_DECL_ALIGNED(16) buf[4];
04250     vst1q_u32(buf, v_sum);
04251     d = buf[0] + buf[1] + buf[2] + buf[3];
04252 #endif
04253     {
04254         for( ; j <= n - 4; j += 4 )
04255         {
04256             d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
04257             std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
04258         }
04259     }
04260     for( ; j < n; j++ )
04261         d += std::abs(a[j] - b[j]);
04262     return d;
04263 }
04264 
04265 }} //cv::hal
04266