the do / gr-peach-opencv-project

Fork of gr-peach-opencv-project by the do



smooth.cpp Source File

smooth.cpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                           License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
00016 // Third party copyrights are property of their respective owners.
00017 //
00018 // Redistribution and use in source and binary forms, with or without modification,
00019 // are permitted provided that the following conditions are met:
00020 //
00021 //   * Redistribution's of source code must retain the above copyright notice,
00022 //     this list of conditions and the following disclaimer.
00023 //
00024 //   * Redistribution's in binary form must reproduce the above copyright notice,
00025 //     this list of conditions and the following disclaimer in the documentation
00026 //     and/or other materials provided with the distribution.
00027 //
00028 //   * The name of the copyright holders may not be used to endorse or promote products
00029 //     derived from this software without specific prior written permission.
00030 //
00031 // This software is provided by the copyright holders and contributors "as is" and
00032 // any express or implied warranties, including, but not limited to, the implied
00033 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00034 // In no event shall the Intel Corporation or contributors be liable for any direct,
00035 // indirect, incidental, special, exemplary, or consequential damages
00036 // (including, but not limited to, procurement of substitute goods or services;
00037 // loss of use, data, or profits; or business interruption) however caused
00038 // and on any theory of liability, whether in contract, strict liability,
00039 // or tort (including negligence or otherwise) arising in any way out of
00040 // the use of this software, even if advised of the possibility of such damage.
00041 //
00042 //M*/
00043 
00044 #include "precomp.hpp"
00045 #include "opencl_kernels_imgproc.hpp"
00046 
00047 /*
00048  * This file includes the code, contributed by Simon Perreault
00049  * (the function icvMedianBlur_8u_O1)
00050  *
00051  * Constant-time median filtering -- http://nomis80.org/ctmf.html
00052  * Copyright (C) 2006 Simon Perreault
00053  *
00054  * Contact:
00055  *  Laboratoire de vision et systemes numeriques
00056  *  Pavillon Adrien-Pouliot
00057  *  Universite Laval
00058  *  Sainte-Foy, Quebec, Canada
00059  *  G1K 7P4
00060  *
00061  *  perreaul@gel.ulaval.ca
00062  */
00063 
00064 namespace cv
00065 {
00066 
00067 /****************************************************************************************\
00068                                          Box Filter
00069 \****************************************************************************************/
00070 
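/*
 * Note: the box filter below is implemented as a separable moving sum.
 * RowSum computes, for every pixel of a row, the sum of `ksize` horizontally
 * adjacent values, sliding the window by adding the incoming sample and
 * subtracting the outgoing one. ColumnSum then accumulates `ksize` of those
 * row sums vertically in the same fashion, so the per-pixel cost stays
 * constant regardless of kernel size.
 */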
00071 template<typename T, typename ST>
00072 struct RowSum :
00073         public BaseRowFilter
00074 {
00075     RowSum( int _ksize, int _anchor ) :
00076         BaseRowFilter()
00077     {
00078         ksize = _ksize;
00079         anchor = _anchor;
00080     }
00081 
00082     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
00083     {
00084         const T* S = (const T*)src;
00085         ST* D = (ST*)dst;
00086         int i = 0, k, ksz_cn = ksize*cn;
00087 
00088         width = (width - 1)*cn;
00089         for( k = 0; k < cn; k++, S++, D++ )
00090         {
00091             ST s = 0;
00092             for( i = 0; i < ksz_cn; i += cn )
00093                 s += S[i];
00094             D[0] = s;
00095             for( i = 0; i < width; i += cn )
00096             {
00097                 s += S[i + ksz_cn] - S[i];
00098                 D[i+cn] = s;
00099             }
00100         }
00101     }
00102 };
00103 
00104 
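/*
 * Generic vertical accumulator: `sum` holds the running column totals of the
 * last (ksize-1) row-sum buffers. For each output row the newest row (Sp) is
 * added, the (optionally scaled) result is written through saturate_cast, and
 * the oldest row (Sm) is subtracted so the window slides down by one row.
 */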
00105 template<typename ST, typename T>
00106 struct ColumnSum :
00107         public BaseColumnFilter
00108 {
00109     ColumnSum( int _ksize, int _anchor, double _scale ) :
00110         BaseColumnFilter()
00111     {
00112         ksize = _ksize;
00113         anchor = _anchor;
00114         scale = _scale;
00115         sumCount = 0;
00116     }
00117 
00118     virtual void reset() { sumCount = 0; }
00119 
00120     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00121     {
00122         int i;
00123         ST* SUM;
00124         bool haveScale = scale != 1;
00125         double _scale = scale;
00126 
00127         if( width != (int)sum.size() )
00128         {
00129             sum.resize(width);
00130             sumCount = 0;
00131         }
00132 
00133         SUM = &sum[0];
00134         if( sumCount == 0 )
00135         {
00136             memset((void*)SUM, 0, width*sizeof(ST));
00137 
00138             for( ; sumCount < ksize - 1; sumCount++, src++ )
00139             {
00140                 const ST* Sp = (const ST*)src[0];
00141                 for( i = 0; i <= width - 2; i += 2 )
00142                 {
00143                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00144                     SUM[i] = s0; SUM[i+1] = s1;
00145                 }
00146 
00147                 for( ; i < width; i++ )
00148                     SUM[i] += Sp[i];
00149             }
00150         }
00151         else
00152         {
00153             CV_Assert( sumCount == ksize-1 );
00154             src += ksize-1;
00155         }
00156 
00157         for( ; count--; src++ )
00158         {
00159             const ST* Sp = (const ST*)src[0];
00160             const ST* Sm = (const ST*)src[1-ksize];
00161             T* D = (T*)dst;
00162             if( haveScale )
00163             {
00164                 for( i = 0; i <= width - 2; i += 2 )
00165                 {
00166                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00167                     D[i] = saturate_cast<T>(s0*_scale);
00168                     D[i+1] = saturate_cast<T>(s1*_scale);
00169                     s0 -= Sm[i]; s1 -= Sm[i+1];
00170                     SUM[i] = s0; SUM[i+1] = s1;
00171                 }
00172 
00173                 for( ; i < width; i++ )
00174                 {
00175                     ST s0 = SUM[i] + Sp[i];
00176                     D[i] = saturate_cast<T>(s0*_scale);
00177                     SUM[i] = s0 - Sm[i];
00178                 }
00179             }
00180             else
00181             {
00182                 for( i = 0; i <= width - 2; i += 2 )
00183                 {
00184                     ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1];
00185                     D[i] = saturate_cast<T>(s0);
00186                     D[i+1] = saturate_cast<T>(s1);
00187                     s0 -= Sm[i]; s1 -= Sm[i+1];
00188                     SUM[i] = s0; SUM[i+1] = s1;
00189                 }
00190 
00191                 for( ; i < width; i++ )
00192                 {
00193                     ST s0 = SUM[i] + Sp[i];
00194                     D[i] = saturate_cast<T>(s0);
00195                     SUM[i] = s0 - Sm[i];
00196                 }
00197             }
00198             dst += dststep;
00199         }
00200     }
00201 
00202     double scale;
00203     int sumCount;
00204     std::vector<ST> sum;
00205 };
00206 
00207 
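/*
 * The specializations below (int->uchar, int->short, int->ushort, int->int,
 * int->float) perform the same sliding-window update as the generic version,
 * adding SSE2 and NEON vectorized inner loops where the hardware supports them.
 */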
00208 template<>
00209 struct ColumnSum<int, uchar> :
00210         public BaseColumnFilter
00211 {
00212     ColumnSum( int _ksize, int _anchor, double _scale ) :
00213         BaseColumnFilter()
00214     {
00215         ksize = _ksize;
00216         anchor = _anchor;
00217         scale = _scale;
00218         sumCount = 0;
00219     }
00220 
00221     virtual void reset() { sumCount = 0; }
00222 
00223     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00224     {
00225         int i;
00226         int* SUM;
00227         bool haveScale = scale != 1;
00228         double _scale = scale;
00229 
00230         #if CV_SSE2
00231             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00232         #endif
00233 
00234         if( width != (int)sum.size() )
00235         {
00236             sum.resize(width);
00237             sumCount = 0;
00238         }
00239 
00240         SUM = &sum[0];
00241         if( sumCount == 0 )
00242         {
00243             memset((void*)SUM, 0, width*sizeof(int));
00244             for( ; sumCount < ksize - 1; sumCount++, src++ )
00245             {
00246                 const int* Sp = (const int*)src[0];
00247                 i = 0;
00248                 #if CV_SSE2
00249                 if(haveSSE2)
00250                 {
00251                     for( ; i <= width-4; i+=4 )
00252                     {
00253                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00254                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00255                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00256                     }
00257                 }
00258                 #elif CV_NEON
00259                 for( ; i <= width - 4; i+=4 )
00260                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00261                 #endif
00262                 for( ; i < width; i++ )
00263                     SUM[i] += Sp[i];
00264             }
00265         }
00266         else
00267         {
00268             CV_Assert( sumCount == ksize-1 );
00269             src += ksize-1;
00270         }
00271 
00272         for( ; count--; src++ )
00273         {
00274             const int* Sp = (const int*)src[0];
00275             const int* Sm = (const int*)src[1-ksize];
00276             uchar* D = (uchar*)dst;
00277             if( haveScale )
00278             {
00279                 i = 0;
00280                 #if CV_SSE2
00281                 if(haveSSE2)
00282                 {
00283                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00284                     for( ; i <= width-8; i+=8 )
00285                     {
00286                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00287                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00288 
00289                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00290                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00291                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00292                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00293 
00294                         __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00295                         __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
00296 
00297                         _s0T = _mm_packs_epi32(_s0T, _s0T1);
00298 
00299                         _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
00300 
00301                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00302                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00303                     }
00304                 }
00305                 #elif CV_NEON
00306                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00307                 for( ; i <= width-8; i+=8 )
00308                 {
00309                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00310                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00311 
00312                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00313                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00314 
00315                     uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
00316                     vst1_u8(D + i, vqmovn_u16(v_dst));
00317 
00318                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00319                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00320                 }
00321                 #endif
00322                 for( ; i < width; i++ )
00323                 {
00324                     int s0 = SUM[i] + Sp[i];
00325                     D[i] = saturate_cast<uchar>(s0*_scale);
00326                     SUM[i] = s0 - Sm[i];
00327                 }
00328             }
00329             else
00330             {
00331                 i = 0;
00332                 #if CV_SSE2
00333                 if(haveSSE2)
00334                 {
00335                     for( ; i <= width-8; i+=8 )
00336                     {
00337                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00338                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00339 
00340                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00341                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00342                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00343                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00344 
00345                         __m128i _s0T = _mm_packs_epi32(_s0, _s01);
00346 
00347                         _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
00348 
00349                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00350                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00351                     }
00352                 }
00353                 #elif CV_NEON
00354                 for( ; i <= width-8; i+=8 )
00355                 {
00356                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00357                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00358 
00359                     uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
00360                     vst1_u8(D + i, vqmovn_u16(v_dst));
00361 
00362                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00363                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00364                 }
00365                 #endif
00366 
00367                 for( ; i < width; i++ )
00368                 {
00369                     int s0 = SUM[i] + Sp[i];
00370                     D[i] = saturate_cast<uchar>(s0);
00371                     SUM[i] = s0 - Sm[i];
00372                 }
00373             }
00374             dst += dststep;
00375         }
00376     }
00377 
00378     double scale;
00379     int sumCount;
00380     std::vector<int> sum;
00381 };
00382 
00383 template<>
00384 struct ColumnSum<int, short> :
00385         public BaseColumnFilter
00386 {
00387     ColumnSum( int _ksize, int _anchor, double _scale ) :
00388         BaseColumnFilter()
00389     {
00390         ksize = _ksize;
00391         anchor = _anchor;
00392         scale = _scale;
00393         sumCount = 0;
00394     }
00395 
00396     virtual void reset() { sumCount = 0; }
00397 
00398     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00399     {
00400         int i;
00401         int* SUM;
00402         bool haveScale = scale != 1;
00403         double _scale = scale;
00404 
00405         #if CV_SSE2
00406             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00407         #endif
00408 
00409         if( width != (int)sum.size() )
00410         {
00411             sum.resize(width);
00412             sumCount = 0;
00413         }
00414         SUM = &sum[0];
00415         if( sumCount == 0 )
00416         {
00417             memset((void*)SUM, 0, width*sizeof(int));
00418             for( ; sumCount < ksize - 1; sumCount++, src++ )
00419             {
00420                 const int* Sp = (const int*)src[0];
00421                 i = 0;
00422                 #if CV_SSE2
00423                 if(haveSSE2)
00424                 {
00425                     for( ; i <= width-4; i+=4 )
00426                     {
00427                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00428                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00429                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00430                     }
00431                 }
00432                 #elif CV_NEON
00433                 for( ; i <= width - 4; i+=4 )
00434                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00435                 #endif
00436                 for( ; i < width; i++ )
00437                     SUM[i] += Sp[i];
00438             }
00439         }
00440         else
00441         {
00442             CV_Assert( sumCount == ksize-1 );
00443             src += ksize-1;
00444         }
00445 
00446         for( ; count--; src++ )
00447         {
00448             const int* Sp = (const int*)src[0];
00449             const int* Sm = (const int*)src[1-ksize];
00450             short* D = (short*)dst;
00451             if( haveScale )
00452             {
00453                 i = 0;
00454                 #if CV_SSE2
00455                 if(haveSSE2)
00456                 {
00457                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00458                     for( ; i <= width-8; i+=8 )
00459                     {
00460                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00461                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00462 
00463                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00464                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00465                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00466                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00467 
00468                         __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00469                         __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
00470 
00471                         _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
00472 
00473                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
00474                         _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
00475                     }
00476                 }
00477                 #elif CV_NEON
00478                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00479                 for( ; i <= width-8; i+=8 )
00480                 {
00481                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00482                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00483 
00484                     int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00485                     int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00486                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
00487 
00488                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00489                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00490                 }
00491                 #endif
00492                 for( ; i < width; i++ )
00493                 {
00494                     int s0 = SUM[i] + Sp[i];
00495                     D[i] = saturate_cast<short>(s0*_scale);
00496                     SUM[i] = s0 - Sm[i];
00497                 }
00498             }
00499             else
00500             {
00501                 i = 0;
00502                 #if CV_SSE2
00503                 if(haveSSE2)
00504                 {
00505                     for( ; i <= width-8; i+=8 )
00506                     {
00507 
00508                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00509                         __m128i _sm1  = _mm_loadu_si128((const __m128i*)(Sm+i+4));
00510 
00511                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00512                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00513                         __m128i _s01  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
00514                                                       _mm_loadu_si128((const __m128i*)(Sp+i+4)));
00515 
00516                         _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
00517 
00518                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00519                         _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
00520                     }
00521                 }
00522                 #elif CV_NEON
00523                 for( ; i <= width-8; i+=8 )
00524                 {
00525                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00526                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00527 
00528                     vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
00529 
00530                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00531                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00532                 }
00533                 #endif
00534 
00535                 for( ; i < width; i++ )
00536                 {
00537                     int s0 = SUM[i] + Sp[i];
00538                     D[i] = saturate_cast<short>(s0);
00539                     SUM[i] = s0 - Sm[i];
00540                 }
00541             }
00542             dst += dststep;
00543         }
00544     }
00545 
00546     double scale;
00547     int sumCount;
00548     std::vector<int> sum;
00549 };
00550 
00551 
00552 template<>
00553 struct ColumnSum<int, ushort> :
00554         public BaseColumnFilter
00555 {
00556     ColumnSum( int _ksize, int _anchor, double _scale ) :
00557         BaseColumnFilter()
00558     {
00559         ksize = _ksize;
00560         anchor = _anchor;
00561         scale = _scale;
00562         sumCount = 0;
00563     }
00564 
00565     virtual void reset() { sumCount = 0; }
00566 
00567     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00568     {
00569         int i;
00570         int* SUM;
00571         bool haveScale = scale != 1;
00572         double _scale = scale;
00573         #if CV_SSE2
00574                 bool haveSSE2 =  checkHardwareSupport(CV_CPU_SSE2);
00575         #endif
00576 
00577         if( width != (int)sum.size() )
00578         {
00579             sum.resize(width);
00580             sumCount = 0;
00581         }
00582         SUM = &sum[0];
00583         if( sumCount == 0 )
00584         {
00585             memset((void*)SUM, 0, width*sizeof(int));
00586             for( ; sumCount < ksize - 1; sumCount++, src++ )
00587             {
00588                 const int* Sp = (const int*)src[0];
00589                 i = 0;
00590                 #if CV_SSE2
00591                 if(haveSSE2)
00592                 {
00593                     for( ; i < width-4; i+=4 )
00594                     {
00595                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00596                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00597                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
00598                     }
00599                 }
00600                 #elif CV_NEON
00601                 for( ; i <= width - 4; i+=4 )
00602                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00603                 #endif
00604                 for( ; i < width; i++ )
00605                     SUM[i] += Sp[i];
00606             }
00607         }
00608         else
00609         {
00610             CV_Assert( sumCount == ksize-1 );
00611             src += ksize-1;
00612         }
00613 
00614         for( ; count--; src++ )
00615         {
00616             const int* Sp = (const int*)src[0];
00617             const int* Sm = (const int*)src[1-ksize];
00618             ushort* D = (ushort*)dst;
00619             if( haveScale )
00620             {
00621                 i = 0;
00622                 #if CV_SSE2
00623                 if(haveSSE2)
00624                 {
00625                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00626                     const __m128i delta0 = _mm_set1_epi32(0x8000);
00627                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
00628 
00629                     for( ; i < width-4; i+=4)
00630                     {
00631                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00632                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00633                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00634 
00635                         __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00636 
00637                         _res = _mm_sub_epi32(_res, delta0);
00638                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
00639 
00640                         _mm_storel_epi64((__m128i*)(D+i), _res);
00641                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00642                     }
00643                 }
00644                 #elif CV_NEON
00645                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00646                 for( ; i <= width-8; i+=8 )
00647                 {
00648                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00649                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00650 
00651                     uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00652                     uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00653                     vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
00654 
00655                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00656                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00657                 }
00658                 #endif
00659                 for( ; i < width; i++ )
00660                 {
00661                     int s0 = SUM[i] + Sp[i];
00662                     D[i] = saturate_cast<ushort>(s0*_scale);
00663                     SUM[i] = s0 - Sm[i];
00664                 }
00665             }
00666             else
00667             {
00668                 i = 0;
00669                 #if  CV_SSE2
00670                 if(haveSSE2)
00671                 {
00672                     const __m128i delta0 = _mm_set1_epi32(0x8000);
00673                     const __m128i delta1 = _mm_set1_epi32(0x80008000);
00674 
00675                     for( ; i < width-4; i+=4 )
00676                     {
00677                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00678                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00679                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00680 
00681                         __m128i _res = _mm_sub_epi32(_s0, delta0);
00682                         _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
00683 
00684                         _mm_storel_epi64((__m128i*)(D+i), _res);
00685                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00686                     }
00687                 }
00688                 #elif CV_NEON
00689                 for( ; i <= width-8; i+=8 )
00690                 {
00691                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00692                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00693 
00694                     vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
00695 
00696                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00697                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00698                 }
00699                 #endif
00700 
00701                 for( ; i < width; i++ )
00702                 {
00703                     int s0 = SUM[i] + Sp[i];
00704                     D[i] = saturate_cast<ushort>(s0);
00705                     SUM[i] = s0 - Sm[i];
00706                 }
00707             }
00708             dst += dststep;
00709         }
00710     }
00711 
00712     double scale;
00713     int sumCount;
00714     std::vector<int> sum;
00715 };
00716 
00717 template<>
00718 struct ColumnSum<int, int> :
00719         public BaseColumnFilter
00720 {
00721     ColumnSum( int _ksize, int _anchor, double _scale ) :
00722         BaseColumnFilter()
00723     {
00724         ksize = _ksize;
00725         anchor = _anchor;
00726         scale = _scale;
00727         sumCount = 0;
00728     }
00729 
00730     virtual void reset() { sumCount = 0; }
00731 
00732     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00733     {
00734         int i;
00735         int* SUM;
00736         bool haveScale = scale != 1;
00737         double _scale = scale;
00738 
00739         #if CV_SSE2
00740             bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
00741         #endif
00742 
00743         if( width != (int)sum.size() )
00744         {
00745             sum.resize(width);
00746             sumCount = 0;
00747         }
00748         SUM = &sum[0];
00749         if( sumCount == 0 )
00750         {
00751             memset((void*)SUM, 0, width*sizeof(int));
00752             for( ; sumCount < ksize - 1; sumCount++, src++ )
00753             {
00754                 const int* Sp = (const int*)src[0];
00755                 i = 0;
00756                 #if CV_SSE2
00757                 if(haveSSE2)
00758                 {
00759                     for( ; i <= width-4; i+=4 )
00760                     {
00761                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00762                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00763                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
00764                     }
00765                 }
00766                 #elif CV_NEON
00767                 for( ; i <= width - 4; i+=4 )
00768                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00769                 #endif
00770                 for( ; i < width; i++ )
00771                     SUM[i] += Sp[i];
00772             }
00773         }
00774         else
00775         {
00776             CV_Assert( sumCount == ksize-1 );
00777             src += ksize-1;
00778         }
00779 
00780         for( ; count--; src++ )
00781         {
00782             const int* Sp = (const int*)src[0];
00783             const int* Sm = (const int*)src[1-ksize];
00784             int* D = (int*)dst;
00785             if( haveScale )
00786             {
00787                 i = 0;
00788                 #if CV_SSE2
00789                 if(haveSSE2)
00790                 {
00791                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00792                     for( ; i <= width-4; i+=4 )
00793                     {
00794                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00795 
00796                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00797                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00798 
00799                         __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00800 
00801                         _mm_storeu_si128((__m128i*)(D+i), _s0T);
00802                         _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
00803                     }
00804                 }
00805                 #elif CV_NEON
00806                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00807                 for( ; i <= width-4; i+=4 )
00808                 {
00809                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00810 
00811                     int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00812                     vst1q_s32(D + i, v_s0d);
00813 
00814                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00815                 }
00816                 #endif
00817                 for( ; i < width; i++ )
00818                 {
00819                     int s0 = SUM[i] + Sp[i];
00820                     D[i] = saturate_cast<int>(s0*_scale);
00821                     SUM[i] = s0 - Sm[i];
00822                 }
00823             }
00824             else
00825             {
00826                 i = 0;
00827                 #if CV_SSE2
00828                 if(haveSSE2)
00829                 {
00830                     for( ; i <= width-4; i+=4 )
00831                     {
00832                         __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
00833                         __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00834                                                      _mm_loadu_si128((const __m128i*)(Sp+i)));
00835 
00836                         _mm_storeu_si128((__m128i*)(D+i), _s0);
00837                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00838                     }
00839                 }
00840                 #elif CV_NEON
00841                 for( ; i <= width-4; i+=4 )
00842                 {
00843                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00844 
00845                     vst1q_s32(D + i, v_s0);
00846                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00847                 }
00848                 #endif
00849 
00850                 for( ; i < width; i++ )
00851                 {
00852                     int s0 = SUM[i] + Sp[i];
00853                     D[i] = s0;
00854                     SUM[i] = s0 - Sm[i];
00855                 }
00856             }
00857             dst += dststep;
00858         }
00859     }
00860 
00861     double scale;
00862     int sumCount;
00863     std::vector<int> sum;
00864 };
00865 
00866 
00867 template<>
00868 struct ColumnSum<int, float> :
00869         public BaseColumnFilter
00870 {
00871     ColumnSum( int _ksize, int _anchor, double _scale ) :
00872         BaseColumnFilter()
00873     {
00874         ksize = _ksize;
00875         anchor = _anchor;
00876         scale = _scale;
00877         sumCount = 0;
00878     }
00879 
00880     virtual void reset() { sumCount = 0; }
00881 
00882     virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
00883     {
00884         int i;
00885         int* SUM;
00886         bool haveScale = scale != 1;
00887         double _scale = scale;
00888 
00889         #if CV_SSE2
00890         bool haveSSE2 =  checkHardwareSupport(CV_CPU_SSE2);
00891         #endif
00892 
00893         if( width != (int)sum.size() )
00894         {
00895             sum.resize(width);
00896             sumCount = 0;
00897         }
00898 
00899         SUM = &sum[0];
00900         if( sumCount == 0 )
00901         {
00902             memset((void *)SUM, 0, sizeof(int) * width);
00903 
00904             for( ; sumCount < ksize - 1; sumCount++, src++ )
00905             {
00906                 const int* Sp = (const int*)src[0];
00907                 i = 0;
00908 
00909                 #if CV_SSE2
00910                 if(haveSSE2)
00911                 {
00912                     for( ; i < width-4; i+=4 )
00913                     {
00914                         __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
00915                         __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
00916                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
00917                     }
00918                 }
00919                 #elif CV_NEON
00920                 for( ; i <= width - 4; i+=4 )
00921                     vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
00922                 #endif
00923 
00924                 for( ; i < width; i++ )
00925                     SUM[i] += Sp[i];
00926             }
00927         }
00928         else
00929         {
00930             CV_Assert( sumCount == ksize-1 );
00931             src += ksize-1;
00932         }
00933 
00934         for( ; count--; src++ )
00935         {
00936             const int * Sp = (const int*)src[0];
00937             const int * Sm = (const int*)src[1-ksize];
00938             float* D = (float*)dst;
00939             if( haveScale )
00940             {
00941                 i = 0;
00942 
00943                 #if CV_SSE2
00944                 if(haveSSE2)
00945                 {
00946                     const __m128 scale4 = _mm_set1_ps((float)_scale);
00947 
00948                     for( ; i < width-4; i+=4)
00949                     {
00950                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00951                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00952                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00953 
00954                         _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
00955                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00956                     }
00957                 }
00958                 #elif CV_NEON
00959                 float32x4_t v_scale = vdupq_n_f32((float)_scale);
00960                 for( ; i <= width-8; i+=8 )
00961                 {
00962                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
00963                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
00964 
00965                     vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
00966                     vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
00967 
00968                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
00969                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
00970                 }
00971                 #endif
00972 
00973                 for( ; i < width; i++ )
00974                 {
00975                     int s0 = SUM[i] + Sp[i];
00976                     D[i] = (float)(s0*_scale);
00977                     SUM[i] = s0 - Sm[i];
00978                 }
00979             }
00980             else
00981             {
00982                 i = 0;
00983 
00984                 #if CV_SSE2
00985                 if(haveSSE2)
00986                 {
00987                     for( ; i < width-4; i+=4)
00988                     {
00989                         __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
00990                         __m128i _s0   = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
00991                                                       _mm_loadu_si128((const __m128i*)(Sp+i)));
00992 
00993                         _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
00994                         _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
00995                     }
00996                 }
00997                 #elif CV_NEON
00998                 for( ; i <= width-8; i+=8 )
00999                 {
01000                     int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
01001                     int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
01002 
01003                     vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
01004                     vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
01005 
01006                     vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
01007                     vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
01008                 }
01009                 #endif
01010 
01011                 for( ; i < width; i++ )
01012                 {
01013                     int s0 = SUM[i] + Sp[i];
01014                     D[i] = (float)(s0);
01015                     SUM[i] = s0 - Sm[i];
01016                 }
01017             }
01018             dst += dststep;
01019         }
01020     }
01021 
01022     double scale;
01023     int sumCount;
01024     std::vector<int> sum;
01025 };
01026 
01027 #ifdef HAVE_OPENCL
01028 
01029 #define DIVUP(total, grain) ((total + grain - 1) / (grain))
01030 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
01031 
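// DIVUP(total, grain) is ceiling division; ROUNDUP(sz, n) rounds sz up to the
// next multiple of n. Both are used below to derive OpenCL global work sizes.
//
// ocl_boxFilter builds and runs an OpenCL version of the box filter: either a
// specialized "filterSmall" kernel for small kernels on Intel GPUs, or the
// general block-based "boxFilter" kernel, whose work-group size is tuned in a
// retry loop until it fits the device limits.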
01032 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
01033                            Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
01034 {
01035     const ocl::Device & dev = ocl::Device::getDefault();
01036     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
01037     bool doubleSupport = dev.doubleFPConfig() > 0;
01038 
01039     if (ddepth < 0)
01040         ddepth = sdepth;
01041 
01042     if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
01043         _src.offset() % esz != 0 || _src.step() % esz != 0)
01044         return false;
01045 
01046     if (anchor.x < 0)
01047         anchor.x = ksize.width / 2;
01048     if (anchor.y < 0)
01049         anchor.y = ksize.height / 2;
01050 
01051     int computeUnits = ocl::Device::getDefault().maxComputeUnits();
01052     float alpha = 1.0f / (ksize.height * ksize.width);
01053     Size size = _src.size(), wholeSize;
01054     bool isolated = (borderType & BORDER_ISOLATED) != 0;
01055     borderType &= ~BORDER_ISOLATED;
01056     int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)),
01057         wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn);
01058 
01059     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
01060     size_t globalsize[2] = { (size_t)size.width, (size_t)size.height };
01061     size_t localsize_general[2] = { 0, 1 }, * localsize = NULL;
01062 
01063     UMat src = _src.getUMat();
01064     if (!isolated)
01065     {
01066         Point ofs;
01067         src.locateROI(wholeSize, ofs);
01068     }
01069 
01070     int h = isolated ? size.height : wholeSize.height;
01071     int w = isolated ? size.width : wholeSize.width;
01072 
01073     size_t maxWorkItemSizes[32];
01074     ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
01075     int tryWorkItems = (int)maxWorkItemSizes[0];
01076 
01077     ocl::Kernel kernel;
01078 
01079     if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) &&
01080         ((ksize.width < 5 && ksize.height < 5 && esz <= 4) ||
01081          (ksize.width == 5 && ksize.height == 5 && cn == 1)))
01082     {
01083         if (w < ksize.width || h < ksize.height)
01084             return false;
01085 
01086         // Figure out what vector size to use for loading the pixels.
01087         int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4;
01088         int pxLoadVecSize = cn * pxLoadNumPixels;
01089 
01090         // Figure out how many pixels per work item to compute in X and Y
01091         // directions.  Too many and we run out of registers.
01092         int pxPerWorkItemX = 1, pxPerWorkItemY = 1;
01093         if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
01094         {
01095             pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8;
01096             pxPerWorkItemY = size.height % 2 ? 1 : 2;
01097         }
01098         else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
01099         {
01100             pxPerWorkItemX = size.width % 2 ? 1 : 2;
01101             pxPerWorkItemY = size.height % 2 ? 1 : 2;
01102         }
01103         globalsize[0] = size.width / pxPerWorkItemX;
01104         globalsize[1] = size.height / pxPerWorkItemY;
01105 
01106         // Need some padding in the private array for pixels
01107         int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);
01108 
01109         // Make the global size a nice round number so the runtime can pick
01110         // from reasonable choices for the workgroup size
01111         const int wgRound = 256;
01112         globalsize[0] = ROUNDUP(globalsize[0], wgRound);
01113 
01114         char build_options[1024], cvt[2][40];
01115         sprintf(build_options, "-D cn=%d "
01116                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
01117                 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
01118                 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
01119                 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
01120                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
01121                 "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER",
01122                 cn, anchor.x, anchor.y, ksize.width, ksize.height,
01123                 pxLoadVecSize, pxLoadNumPixels,
01124                 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
01125                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
01126                 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
01127                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
01128                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
01129                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
01130                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
01131                 normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
01132                 ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV
01133                 );
01134 
01135 
01136         if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options))
01137             return false;
01138     }
01139     else
01140     {
01141         localsize = localsize_general;
01142         for ( ; ; )
01143         {
01144             int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
01145 
01146             while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
01147                 BLOCK_SIZE_X /= 2;
01148             while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
01149                 BLOCK_SIZE_Y *= 2;
01150 
01151             if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
01152                 return false;
01153 
01154             char cvt[2][50];
01155             String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
01156                                  " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
01157                                  " -D ST1=%s -D DT1=%s -D cn=%d",
01158                                  BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
01159                                  ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
01160                                  ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
01161                                  ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
01162                                  anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
01163                                  isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
01164                                  normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
01165                                  ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
01166 
01167             localsize[0] = BLOCK_SIZE_X;
01168             globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
01169             globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);
01170 
01171             kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
01172             if (kernel.empty())
01173                 return false;
01174 
01175             size_t kernelWorkGroupSize = kernel.workGroupSize();
01176             if (localsize[0] <= kernelWorkGroupSize)
01177                 break;
01178             if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
01179                 return false;
01180 
01181             tryWorkItems = (int)kernelWorkGroupSize;
01182         }
01183     }
01184 
01185     _dst.create(size, CV_MAKETYPE(ddepth, cn));
01186     UMat dst = _dst.getUMat();
01187 
01188     int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
01189     idxArg = kernel.set(idxArg, (int)src.step);
01190     int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
01191     int srcOffsetY = (int)(src.offset / src.step);
01192     int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
01193     int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
01194     idxArg = kernel.set(idxArg, srcOffsetX);
01195     idxArg = kernel.set(idxArg, srcOffsetY);
01196     idxArg = kernel.set(idxArg, srcEndX);
01197     idxArg = kernel.set(idxArg, srcEndY);
01198     idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
01199     if (normalize)
01200         idxArg = kernel.set(idxArg, (float)alpha);
01201 
01202     return kernel.run(2, globalsize, localsize, false);
01203 }
01204 
01205 #undef ROUNDUP
01206 
01207 #endif
01208 
01209 }
01210 
01211 
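/*
 * The two factory functions below map a (source depth, sum depth) or
 * (sum depth, destination depth) pair onto the matching RowSum/ColumnSum
 * template instantiation, defaulting the anchor to the kernel centre and
 * raising CV_StsNotImplemented for unsupported combinations.
 */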
01212 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
01213 {
01214     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
01215     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
01216 
01217     if( anchor < 0 )
01218         anchor = ksize/2;
01219 
01220     if( sdepth == CV_8U && ddepth == CV_32S )
01221         return makePtr<RowSum<uchar, int> >(ksize, anchor);
01222     if( sdepth == CV_8U && ddepth == CV_64F )
01223         return makePtr<RowSum<uchar, double> >(ksize, anchor);
01224     if( sdepth == CV_16U && ddepth == CV_32S )
01225         return makePtr<RowSum<ushort, int> >(ksize, anchor);
01226     if( sdepth == CV_16U && ddepth == CV_64F )
01227         return makePtr<RowSum<ushort, double> >(ksize, anchor);
01228     if( sdepth == CV_16S && ddepth == CV_32S )
01229         return makePtr<RowSum<short, int> >(ksize, anchor);
01230     if( sdepth == CV_32S && ddepth == CV_32S )
01231         return makePtr<RowSum<int, int> >(ksize, anchor);
01232     if( sdepth == CV_16S && ddepth == CV_64F )
01233         return makePtr<RowSum<short, double> >(ksize, anchor);
01234     if( sdepth == CV_32F && ddepth == CV_64F )
01235         return makePtr<RowSum<float, double> >(ksize, anchor);
01236     if( sdepth == CV_64F && ddepth == CV_64F )
01237         return makePtr<RowSum<double, double> >(ksize, anchor);
01238 
01239     CV_Error_( CV_StsNotImplemented,
01240         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
01241         srcType, sumType));
01242 
01243     return Ptr<BaseRowFilter> ();
01244 }
01245 
01246 
01247 cv::Ptr<cv::BaseColumnFilter> cv::getColumnSumFilter(int sumType, int dstType, int ksize,
01248                                                      int anchor, double scale)
01249 {
01250     int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType);
01251     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) );
01252 
01253     if( anchor < 0 )
01254         anchor = ksize/2;
01255 
01256     if( ddepth == CV_8U && sdepth == CV_32S )
01257         return makePtr<ColumnSum<int, uchar> >(ksize, anchor, scale);
01258     if( ddepth == CV_8U && sdepth == CV_64F )
01259         return makePtr<ColumnSum<double, uchar> >(ksize, anchor, scale);
01260     if( ddepth == CV_16U && sdepth == CV_32S )
01261         return makePtr<ColumnSum<int, ushort> >(ksize, anchor, scale);
01262     if( ddepth == CV_16U && sdepth == CV_64F )
01263         return makePtr<ColumnSum<double, ushort> >(ksize, anchor, scale);
01264     if( ddepth == CV_16S && sdepth == CV_32S )
01265         return makePtr<ColumnSum<int, short> >(ksize, anchor, scale);
01266     if( ddepth == CV_16S && sdepth == CV_64F )
01267         return makePtr<ColumnSum<double, short> >(ksize, anchor, scale);
01268     if( ddepth == CV_32S && sdepth == CV_32S )
01269         return makePtr<ColumnSum<int, int> >(ksize, anchor, scale);
01270     if( ddepth == CV_32F && sdepth == CV_32S )
01271         return makePtr<ColumnSum<int, float> >(ksize, anchor, scale);
01272     if( ddepth == CV_32F && sdepth == CV_64F )
01273         return makePtr<ColumnSum<double, float> >(ksize, anchor, scale);
01274     if( ddepth == CV_64F && sdepth == CV_32S )
01275         return makePtr<ColumnSum<int, double> >(ksize, anchor, scale);
01276     if( ddepth == CV_64F && sdepth == CV_64F )
01277         return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
01278 
01279     CV_Error_( CV_StsNotImplemented,
01280         ("Unsupported combination of sum format (=%d), and destination format (=%d)",
01281         sumType, dstType));
01282 
01283     return Ptr<BaseColumnFilter> ();
01284 }
01285 
01286 
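/*
 * createBoxFilter selects the intermediate sum type: for integer sources,
 * 32-bit integer sums are used when the filter is unnormalized or the kernel
 * area stays below a per-depth overflow limit (2^23 for 8-bit, 2^15 for
 * unsigned 16-bit, 2^16 otherwise); everything else accumulates in CV_64F.
 */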
01287 cv::Ptr<cv::FilterEngine> cv::createBoxFilter( int srcType, int dstType, Size ksize,
01288                     Point anchor, bool normalize, int borderType )
01289 {
01290     int sdepth = CV_MAT_DEPTH(srcType);
01291     int cn = CV_MAT_CN(srcType), sumType = CV_64F;
01292     if( sdepth <= CV_32S && (!normalize ||
01293         ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) :
01294             sdepth == CV_16U ? (1 << 15) : (1 << 16))) )
01295         sumType = CV_32S;
01296     sumType = CV_MAKETYPE( sumType, cn );
01297 
01298     Ptr<BaseRowFilter>  rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x );
01299     Ptr<BaseColumnFilter>  columnFilter = getColumnSumFilter(sumType,
01300         dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1);
01301 
01302     return makePtr<FilterEngine>(Ptr<BaseFilter> (), rowFilter, columnFilter,
01303            srcType, dstType, sumType, borderType );
01304 }
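/*
 * Typical call path (illustrative note, not part of the upstream file): the
 * public cv::blur() and cv::boxFilter() entry points construct this engine on
 * the CPU fallback path, e.g.
 *
 *   cv::blur(src, dst, cv::Size(5, 5));                         // normalized box filter
 *   cv::boxFilter(src, dst, -1, cv::Size(5, 5),
 *                 cv::Point(-1, -1), false);                    // unnormalized sums
 */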
01305 
01306 #if defined(HAVE_IPP)
01307 namespace cv
01308 {
01309 static bool ipp_boxfilter( InputArray _src, OutputArray _dst, int ddepth,
01310                 Size ksize, Point anchor,
01311                 bool normalize, int borderType )
01312 {
01313     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
01314     if( ddepth < 0 )
01315         ddepth = sdepth;
01316     int ippBorderType = borderType & ~BORDER_ISOLATED;
01317     Point ocvAnchor, ippAnchor;
01318     ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
01319     ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
01320     ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
01321     ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
01322 
01323     Mat src = _src.getMat();
01324     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
01325     Mat dst = _dst.getMat();
01326     if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
01327     {
01328         if( src.rows == 1 )
01329             ksize.height = 1;
01330         if( src.cols == 1 )
01331             ksize.width = 1;
01332     }
01333 
01334     {
01335         if (normalize && !src.isSubmatrix() && ddepth == sdepth &&
01336             (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
01337              ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
01338              dst.cols != ksize.width && dst.rows != ksize.height) // returns ippStsMaskSizeErr: mask has an illegal value
01339         {
01340             Ipp32s bufSize = 0;
01341             IppiSize roiSize = { dst.cols, dst.rows }, maskSize = { ksize.width, ksize.height };
01342 
01343 #define IPP_FILTER_BOX_BORDER(ippType, ippDataType, flavor) \
01344             do \
01345             { \
01346                 if (ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ippDataType, cn, &bufSize) >= 0) \
01347                 { \
01348                     Ipp8u * buffer = ippsMalloc_8u(bufSize); \
01349                     ippType borderValue[4] = { 0, 0, 0, 0 }; \
01350                     ippBorderType = ippBorderType == BORDER_CONSTANT ? ippBorderConst : ippBorderRepl; \
01351                     IppStatus status = ippiFilterBoxBorder_##flavor(src.ptr<ippType>(), (int)src.step, dst.ptr<ippType>(), \
01352                                                                     (int)dst.step, roiSize, maskSize, \
01353                                                                     (IppiBorderType)ippBorderType, borderValue, buffer); \
01354                     ippsFree(buffer); \
01355                     if (status >= 0) \
01356                     { \
01357                         CV_IMPL_ADD(CV_IMPL_IPP); \
01358                         return true; \
01359                     } \
01360                 } \
01361             } while ((void)0, 0)
01362 
01363             if (stype == CV_8UC1)
01364                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C1R);
01365             else if (stype == CV_8UC3)
01366                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C3R);
01367             else if (stype == CV_8UC4)
01368                 IPP_FILTER_BOX_BORDER(Ipp8u, ipp8u, 8u_C4R);
01369 
01370             // Oct 2014: performance with BORDER_CONSTANT
01371             //else if (stype == CV_16UC1)
01372             //    IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C1R);
01373             else if (stype == CV_16UC3)
01374                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C3R);
01375             else if (stype == CV_16UC4)
01376                 IPP_FILTER_BOX_BORDER(Ipp16u, ipp16u, 16u_C4R);
01377 
01378             // Oct 2014: performance with BORDER_CONSTANT
01379             //else if (stype == CV_16SC1)
01380             //    IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C1R);
01381             else if (stype == CV_16SC3)
01382                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C3R);
01383             else if (stype == CV_16SC4)
01384                 IPP_FILTER_BOX_BORDER(Ipp16s, ipp16s, 16s_C4R);
01385 
01386             else if (stype == CV_32FC1)
01387                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C1R);
01388             else if (stype == CV_32FC3)
01389                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C3R);
01390             else if (stype == CV_32FC4)
01391                 IPP_FILTER_BOX_BORDER(Ipp32f, ipp32f, 32f_C4R);
01392         }
01393 #undef IPP_FILTER_BOX_BORDER
01394     }
01395     return false;
01396 }
01397 }
01398 #endif
01399 
01400 
01401 void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
01402                 Size ksize, Point anchor,
01403                 bool normalize, int borderType )
01404 {
01405 #ifdef HAVE_OPENCL
01406     CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
01407 #endif
01408 
01409     Mat src = _src.getMat();
01410     int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
01411     if( ddepth < 0 )
01412         ddepth = sdepth;
01413     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
01414     Mat dst = _dst.getMat();
01415     if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
01416     {
01417         if( src.rows == 1 )
01418             ksize.height = 1;
01419         if( src.cols == 1 )
01420             ksize.width = 1;
01421     }
01422 #ifdef HAVE_TEGRA_OPTIMIZATION
01423     if ( tegra::useTegra() && tegra::box(src, dst, ksize, anchor, normalize, borderType) )
01424         return;
01425 #endif
01426 
01427 #ifdef HAVE_IPP
01428     int ippBorderType = borderType & ~BORDER_ISOLATED;
01429 #endif
01430     Point ocvAnchor, ippAnchor;
01431     ocvAnchor.x = anchor.x < 0 ? ksize.width / 2 : anchor.x;
01432     ocvAnchor.y = anchor.y < 0 ? ksize.height / 2 : anchor.y;
01433     ippAnchor.x = ksize.width / 2 - (ksize.width % 2 == 0 ? 1 : 0);
01434     ippAnchor.y = ksize.height / 2 - (ksize.height % 2 == 0 ? 1 : 0);
01435     CV_IPP_RUN((normalize && !_src.isSubmatrix() && ddepth == sdepth &&
01436             (/*ippBorderType == BORDER_REPLICATE ||*/ /* returns ippStsStepErr: Step value is not valid */
01437              ippBorderType == BORDER_CONSTANT) && ocvAnchor == ippAnchor &&
01438              _dst.cols() != ksize.width && _dst.rows() != ksize.height),
01439              ipp_boxfilter( _src,  _dst,  ddepth, ksize,  anchor, normalize,  borderType));
01440 
01441 
01442     Ptr<FilterEngine> f = createBoxFilter( src.type(), dst.type(),
01443                         ksize, anchor, normalize, borderType );
01444     f->apply( src, dst );
01445 }
01446 
01447 
01448 void cv::blur( InputArray src, OutputArray dst,
01449            Size ksize, Point anchor, int borderType )
01450 {
01451     boxFilter( src, dst, -1, ksize, anchor, true, borderType );
01452 }
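// --- Editor's illustrative example, not part of the original smooth.cpp ---
// cv::blur() above is just the normalized box filter: each output pixel is the
// mean of the ksize.width x ksize.height window around it, while cv::boxFilter()
// with normalize == false returns the raw window sum.  A minimal usage sketch,
// assuming an 8-bit input; exampleBoxBlur is a hypothetical name.
static void exampleBoxBlur(const cv::Mat& src8u)
{
    cv::Mat mean8u, sum32s;
    // 5x5 mean with the default centered anchor and a reflected border.
    cv::blur(src8u, mean8u, cv::Size(5, 5), cv::Point(-1, -1), cv::BORDER_REFLECT_101);
    // The same window as an unnormalized sum, accumulated in 32-bit integers.
    cv::boxFilter(src8u, sum32s, CV_32S, cv::Size(5, 5), cv::Point(-1, -1),
                  false, cv::BORDER_REFLECT_101);
}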
01453 
01454 
01455 /****************************************************************************************\
01456                                     Squared Box Filter
01457 \****************************************************************************************/
01458 
01459 namespace cv
01460 {
01461 
01462 template<typename T, typename ST>
01463 struct SqrRowSum :
01464         public BaseRowFilter
01465 {
01466     SqrRowSum( int _ksize, int _anchor ) :
01467         BaseRowFilter()
01468     {
01469         ksize = _ksize;
01470         anchor = _anchor;
01471     }
01472 
01473     virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
01474     {
01475         const T* S = (const T*)src;
01476         ST* D = (ST*)dst;
01477         int i = 0, k, ksz_cn = ksize*cn;
01478 
01479         width = (width - 1)*cn;
01480         for( k = 0; k < cn; k++, S++, D++ )
01481         {
01482             ST s = 0;
01483             for( i = 0; i < ksz_cn; i += cn )
01484             {
01485                 ST val = (ST)S[i];
01486                 s += val*val;
01487             }
01488             D[0] = s;
01489             for( i = 0; i < width; i += cn )
01490             {
01491                 ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn];
01492                 s += val1*val1 - val0*val0;
01493                 D[i+cn] = s;
01494             }
01495         }
01496     }
01497 };
01498 
01499 static Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor)
01500 {
01501     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType);
01502     CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) );
01503 
01504     if( anchor < 0 )
01505         anchor = ksize/2;
01506 
01507     if( sdepth == CV_8U && ddepth == CV_32S )
01508         return makePtr<SqrRowSum<uchar, int> >(ksize, anchor);
01509     if( sdepth == CV_8U && ddepth == CV_64F )
01510         return makePtr<SqrRowSum<uchar, double> >(ksize, anchor);
01511     if( sdepth == CV_16U && ddepth == CV_64F )
01512         return makePtr<SqrRowSum<ushort, double> >(ksize, anchor);
01513     if( sdepth == CV_16S && ddepth == CV_64F )
01514         return makePtr<SqrRowSum<short, double> >(ksize, anchor);
01515     if( sdepth == CV_32F && ddepth == CV_64F )
01516         return makePtr<SqrRowSum<float, double> >(ksize, anchor);
01517     if( sdepth == CV_64F && ddepth == CV_64F )
01518         return makePtr<SqrRowSum<double, double> >(ksize, anchor);
01519 
01520     CV_Error_( CV_StsNotImplemented,
01521               ("Unsupported combination of source format (=%d), and buffer format (=%d)",
01522                srcType, sumType));
01523 
01524     return Ptr<BaseRowFilter>();
01525 }
01526 
01527 }
01528 
01529 void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
01530                        Size ksize, Point anchor,
01531                        bool normalize, int borderType )
01532 {
01533     int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
01534     Size size = _src.size();
01535 
01536     if( ddepth < 0 )
01537         ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
01538 
01539     if( borderType != BORDER_CONSTANT && normalize )
01540     {
01541         if( size.height == 1 )
01542             ksize.height = 1;
01543         if( size.width == 1 )
01544             ksize.width = 1;
01545     }
01546 
01547 #ifdef HAVE_OPENCL
01548     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
01549                ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
01550 #endif
01551 
01552     int sumDepth = CV_64F;
01553     if( sdepth == CV_8U )
01554         sumDepth = CV_32S;
01555     int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
01556 
01557     Mat src = _src.getMat();
01558     _dst.create( size, dstType );
01559     Mat dst = _dst.getMat();
01560 
01561     Ptr<BaseRowFilter>  rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
01562     Ptr<BaseColumnFilter>  columnFilter = getColumnSumFilter(sumType,
01563                                                             dstType, ksize.height, anchor.y,
01564                                                             normalize ? 1./(ksize.width*ksize.height) : 1);
01565 
01566     Ptr<FilterEngine> f = makePtr<FilterEngine>(Ptr<BaseFilter> (), rowFilter, columnFilter,
01567                                                 srcType, dstType, sumType, borderType );
01568     f->apply( src, dst );
01569 }
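// --- Editor's illustrative example, not part of the original smooth.cpp ---
// A common use of sqrBoxFilter() is a local variance map, Var = E[x^2] - E[x]^2
// over each window.  A minimal sketch assuming a single-channel CV_32F input;
// exampleLocalVariance is a hypothetical name.
static cv::Mat exampleLocalVariance(const cv::Mat& src32f, cv::Size win)
{
    cv::Mat mean, meanOfSquares;
    cv::boxFilter(src32f, mean, CV_32F, win);             // E[x]
    cv::sqrBoxFilter(src32f, meanOfSquares, CV_32F, win); // E[x^2]
    return meanOfSquares - mean.mul(mean);                // per-pixel variance
}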
01570 
01571 
01572 /****************************************************************************************\
01573                                      Gaussian Blur
01574 \****************************************************************************************/
01575 
01576 cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype )
01577 {
01578     const int SMALL_GAUSSIAN_SIZE = 7;
01579     static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] =
01580     {
01581         {1.f},
01582         {0.25f, 0.5f, 0.25f},
01583         {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f},
01584         {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f}
01585     };
01586 
01587     const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ?
01588         small_gaussian_tab[n>>1] : 0;
01589 
01590     CV_Assert( ktype == CV_32F || ktype == CV_64F );
01591     Mat kernel(n, 1, ktype);
01592     float* cf = kernel.ptr<float>();
01593     double* cd = kernel.ptr<double>();
01594 
01595     double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8;
01596     double scale2X = -0.5/(sigmaX*sigmaX);
01597     double sum = 0;
01598 
01599     int i;
01600     for( i = 0; i < n; i++ )
01601     {
01602         double x = i - (n-1)*0.5;
01603         double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x);
01604         if( ktype == CV_32F )
01605         {
01606             cf[i] = (float)t;
01607             sum += cf[i];
01608         }
01609         else
01610         {
01611             cd[i] = t;
01612             sum += cd[i];
01613         }
01614     }
01615 
01616     sum = 1./sum;
01617     for( i = 0; i < n; i++ )
01618     {
01619         if( ktype == CV_32F )
01620             cf[i] = (float)(cf[i]*sum);
01621         else
01622             cd[i] *= sum;
01623     }
01624 
01625     return kernel;
01626 }
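// --- Editor's illustrative example, not part of the original smooth.cpp ---
// getGaussianKernel() returns an n x 1 column vector of weights.  For odd
// n <= 7 with sigma <= 0 the hard-coded table above is used; otherwise the
// weights are exp(-x^2 / (2*sigma^2)) rescaled so that they sum to 1.
// exampleGaussianKernelSum is a hypothetical name used only for this sketch.
static double exampleGaussianKernelSum()
{
    cv::Mat k = cv::getGaussianKernel(5, 1.0, CV_64F);  // 5x1 kernel, CV_64F
    return cv::sum(k)[0];                                // ~1.0 by construction
}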
01627 
01628 namespace cv {
01629 
01630 static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize,
01631                                    double sigma1, double sigma2 )
01632 {
01633     int depth = CV_MAT_DEPTH(type);
01634     if( sigma2 <= 0 )
01635         sigma2 = sigma1;
01636 
01637     // automatic detection of kernel size from sigma
01638     if( ksize.width <= 0 && sigma1 > 0 )
01639         ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
01640     if( ksize.height <= 0 && sigma2 > 0 )
01641         ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1;
01642 
01643     CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
01644         ksize.height > 0 && ksize.height % 2 == 1 );
01645 
01646     sigma1 = std::max( sigma1, 0. );
01647     sigma2 = std::max( sigma2, 0. );
01648 
01649     kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
01650     if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
01651         ky = kx;
01652     else
01653         ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
01654 }
01655 
01656 }
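// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// When ksize is not given, createGaussianKernels() above derives it from sigma
// as cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1, i.e. roughly +/-3
// standard deviations for 8-bit data and +/-4 for deeper types, forced odd.
// For example, sigma = 1.5 on a CV_8U image gives cvRound(10.0) | 1 = 11, an
// 11-tap kernel.  exampleAutoKernelSize is a hypothetical helper restating
// that arithmetic.
static int exampleAutoKernelSize(double sigma, int depth)
{
    return cvRound(sigma * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}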
01657 
01658 cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
01659                                         double sigma1, double sigma2,
01660                                         int borderType )
01661 {
01662     Mat kx, ky;
01663     createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
01664 
01665     return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType );
01666 }
01667 
01668 #ifdef HAVE_IPP
01669 namespace cv
01670 {
01671 static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
01672                    double sigma1, double sigma2,
01673                    int borderType )
01674 {
01675 #if IPP_VERSION_X100 >= 810
01676     int type = _src.type();
01677     Size size = _src.size();
01678 
01679     if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
01680     {
01681         if( size.height == 1 )
01682             ksize.height = 1;
01683         if( size.width == 1 )
01684             ksize.width = 1;
01685     }
01686 
01687     int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
01688 
01689     if ((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn == 1 || cn == 3) &&
01690             sigma1 == sigma2 && ksize.width == ksize.height && sigma1 != 0.0 )
01691     {
01692         IppiBorderType ippBorder = ippiGetBorderType(borderType);
01693         if (ippBorderConst == ippBorder || ippBorderRepl == ippBorder)
01694         {
01695             Mat src = _src.getMat(), dst = _dst.getMat();
01696             IppiSize roiSize = { src.cols, src.rows };
01697             IppDataType dataType = ippiGetDataType(depth);
01698             Ipp32s specSize = 0, bufferSize = 0;
01699 
01700             if (ippiFilterGaussianGetBufferSize(roiSize, (Ipp32u)ksize.width, dataType, cn, &specSize, &bufferSize) >= 0)
01701             {
01702                 IppAutoBuffer<IppFilterGaussianSpec> spec(specSize);
01703                 IppAutoBuffer<Ipp8u> buffer(bufferSize);
01704 
01705                 if (ippiFilterGaussianInit(roiSize, (Ipp32u)ksize.width, (Ipp32f)sigma1, ippBorder, dataType, cn, spec, buffer) >= 0)
01706                 {
01707 #define IPP_FILTER_GAUSS_C1(ippfavor) \
01708                     { \
01709                         Ipp##ippfavor borderValues = 0; \
01710                         status = ippiFilterGaussianBorder_##ippfavor##_C1R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
01711                                 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
01712                     }
01713 
01714 #define IPP_FILTER_GAUSS_CN(ippfavor, ippcn) \
01715                     { \
01716                         Ipp##ippfavor borderValues[] = { 0, 0, 0 }; \
01717                         status = ippiFilterGaussianBorder_##ippfavor##_C##ippcn##R(src.ptr<Ipp##ippfavor>(), (int)src.step, \
01718                                 dst.ptr<Ipp##ippfavor>(), (int)dst.step, roiSize, borderValues, spec, buffer); \
01719                     }
01720 
01721                     IppStatus status = ippStsErr;
01722 #if !HAVE_ICV
01723 #if IPP_VERSION_X100 > 900 // Buffer overflow may happen in IPP 9.0.0 and less
01724                     if (type == CV_8UC1)
01725                         IPP_FILTER_GAUSS_C1(8u)
01726                     else
01727 #endif
01728                     if (type == CV_8UC3)
01729                         IPP_FILTER_GAUSS_CN(8u, 3)
01730                     else if (type == CV_16UC1)
01731                         IPP_FILTER_GAUSS_C1(16u)
01732                     else if (type == CV_16UC3)
01733                         IPP_FILTER_GAUSS_CN(16u, 3)
01734                     else if (type == CV_16SC1)
01735                         IPP_FILTER_GAUSS_C1(16s)
01736                     else if (type == CV_16SC3)
01737                         IPP_FILTER_GAUSS_CN(16s, 3)
01738                     else if (type == CV_32FC3)
01739                         IPP_FILTER_GAUSS_CN(32f, 3)
01740                     else
01741 #endif
01742                     if (type == CV_32FC1)
01743                         IPP_FILTER_GAUSS_C1(32f)
01744 
01745                     if(status >= 0)
01746                         return true;
01747 
01748 #undef IPP_FILTER_GAUSS_C1
01749 #undef IPP_FILTER_GAUSS_CN
01750                 }
01751             }
01752         }
01753     }
01754 #else
01755     CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType);
01756 #endif
01757     return false;
01758 }
01759 }
01760 #endif
01761 
01762 
01763 void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
01764                    double sigma1, double sigma2,
01765                    int borderType )
01766 {
01767     int type = _src.type();
01768     Size size = _src.size();
01769     _dst.create( size, type );
01770 
01771     if( borderType != BORDER_CONSTANT && (borderType & BORDER_ISOLATED) != 0 )
01772     {
01773         if( size.height == 1 )
01774             ksize.height = 1;
01775         if( size.width == 1 )
01776             ksize.width = 1;
01777     }
01778 
01779     if( ksize.width == 1 && ksize.height == 1 )
01780     {
01781         _src.copyTo(_dst);
01782         return;
01783     }
01784 
01785 #ifdef HAVE_TEGRA_OPTIMIZATION
01786     Mat src = _src.getMat();
01787     Mat dst = _dst.getMat();
01788     if(sigma1 == 0 && sigma2 == 0 && tegra::useTegra() && tegra::gaussian(src, dst, ksize, borderType))
01789         return;
01790 #endif
01791 
01792     CV_IPP_RUN(true, ipp_GaussianBlur( _src,  _dst,  ksize, sigma1,  sigma2, borderType));
01793 
01794     Mat kx, ky;
01795     createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
01796     sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
01797 }
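// --- Editor's illustrative example, not part of the original smooth.cpp ---
// Typical calls into the public cv::GaussianBlur() entry point above: either
// the kernel size or sigma may be left to be derived from the other.
// exampleGaussianBlur is a hypothetical name used only for this sketch.
static void exampleGaussianBlur(const cv::Mat& src, cv::Mat& dst)
{
    // Explicit 7x7 kernel; sigma is derived from the kernel size.
    cv::GaussianBlur(src, dst, cv::Size(7, 7), 0);
    // Explicit sigma; the kernel size is derived automatically from Size(0, 0).
    cv::GaussianBlur(src, dst, cv::Size(0, 0), 2.0);
}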
01798 
01799 /****************************************************************************************\
01800                                       Median Filter
01801 \****************************************************************************************/
01802 
01803 namespace cv
01804 {
01805 typedef ushort HT;
01806 
01807 /**
01808  * This structure represents a two-tier histogram. The first tier (known as the
01809  * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level)
01810  * is 8 bit wide. Pixels inserted in the fine level also get inserted into the
01811  * coarse bucket designated by the 4 MSBs of the fine bucket value.
01812  *
01813  * The structure is aligned on 16 bytes, which is a prerequisite for SIMD
01814  * instructions. Each bucket is 16 bit wide, which means that extra care must be
01815  * taken to prevent overflow.
01816  */
01817 typedef struct
01818 {
01819     HT coarse[16];
01820     HT fine[16][16];
01821 } Histogram;
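// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// Bucket layout of the two-tier histogram above: for a pixel value such as
// 0xA7 (167), the top 4 bits select coarse[0xA] and the low 4 bits select
// fine[0xA][0x7], which is exactly what the HOP macro below does.
// exampleHistogramInsert is a hypothetical helper showing one insertion.
static inline void exampleHistogramInsert(Histogram& h, uchar v)
{
    h.coarse[v >> 4]++;
    h.fine[v >> 4][v & 0xF]++;
}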
01822 
01823 
01824 #if CV_SSE2
01825 #define MEDIAN_HAVE_SIMD 1
01826 
01827 static inline void histogram_add_simd( const HT x[16], HT y[16] )
01828 {
01829     const __m128i* rx = (const __m128i*)x;
01830     __m128i* ry = (__m128i*)y;
01831     __m128i r0 = _mm_add_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
01832     __m128i r1 = _mm_add_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
01833     _mm_store_si128(ry+0, r0);
01834     _mm_store_si128(ry+1, r1);
01835 }
01836 
01837 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
01838 {
01839     const __m128i* rx = (const __m128i*)x;
01840     __m128i* ry = (__m128i*)y;
01841     __m128i r0 = _mm_sub_epi16(_mm_load_si128(ry+0),_mm_load_si128(rx+0));
01842     __m128i r1 = _mm_sub_epi16(_mm_load_si128(ry+1),_mm_load_si128(rx+1));
01843     _mm_store_si128(ry+0, r0);
01844     _mm_store_si128(ry+1, r1);
01845 }
01846 
01847 #elif CV_NEON
01848 #define MEDIAN_HAVE_SIMD 1
01849 
01850 static inline void histogram_add_simd( const HT x[16], HT y[16] )
01851 {
01852     vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y)));
01853     vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8)));
01854 }
01855 
01856 static inline void histogram_sub_simd( const HT x[16], HT y[16] )
01857 {
01858     vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x)));
01859     vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8)));
01860 }
01861 
01862 #else
01863 #define MEDIAN_HAVE_SIMD 0
01864 #endif
01865 
01866 
01867 static inline void histogram_add( const HT x[16], HT y[16] )
01868 {
01869     int i;
01870     for( i = 0; i < 16; ++i )
01871         y[i] = (HT)(y[i] + x[i]);
01872 }
01873 
01874 static inline void histogram_sub( const HT x[16], HT y[16] )
01875 {
01876     int i;
01877     for( i = 0; i < 16; ++i )
01878         y[i] = (HT)(y[i] - x[i]);
01879 }
01880 
01881 static inline void histogram_muladd( int a, const HT x[16],
01882         HT y[16] )
01883 {
01884     for( int i = 0; i < 16; ++i )
01885         y[i] = (HT)(y[i] + a * x[i]);
01886 }
01887 
01888 static void
01889 medianBlur_8u_O1 ( const Mat& _src, Mat& _dst, int ksize )
01890 {
01891 /**
01892  * HOP is short for Histogram OPeration. This macro makes an operation \a op on
01893  * histogram \a h for pixel value \a x. It takes care of handling both levels.
01894  */
01895 #define HOP(h,x,op) \
01896     h.coarse[x>>4] op, \
01897     *((HT*)h.fine + x) op
01898 
01899 #define COP(c,j,x,op) \
01900     h_coarse[ 16*(n*c+j) + (x>>4) ] op, \
01901     h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op
01902 
01903     int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
01904     size_t sstep = _src.step, dstep = _dst.step;
01905     Histogram CV_DECL_ALIGNED(16) H[4];
01906     HT CV_DECL_ALIGNED(16) luc[4][16];
01907 
01908     int STRIPE_SIZE = std::min( _dst.cols, 512/cn );
01909 
01910     std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
01911     std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
01912     HT* h_coarse = alignPtr(&_h_coarse[0], 16);
01913     HT* h_fine = alignPtr(&_h_fine[0], 16);
01914 #if MEDIAN_HAVE_SIMD
01915     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
01916 #endif
01917 
01918     for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
01919     {
01920         int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2;
01921         const uchar* src = _src.ptr() + x*cn;
01922         uchar* dst = _dst.ptr() + (x - r)*cn;
01923 
01924         memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) );
01925         memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) );
01926 
01927         // First row initialization
01928         for( c = 0; c < cn; c++ )
01929         {
01930             for( j = 0; j < n; j++ )
01931                 COP( c, j, src[cn*j+c], += (cv::HT)(r+2) );
01932 
01933             for( i = 1; i < r; i++ )
01934             {
01935                 const uchar* p = src + sstep*std::min(i, m-1);
01936                 for ( j = 0; j < n; j++ )
01937                     COP( c, j, p[cn*j+c], ++ );
01938             }
01939         }
01940 
01941         for( i = 0; i < m; i++ )
01942         {
01943             const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
01944             const uchar* p1 = src + sstep * std::min( m-1, i+r );
01945 
01946             memset( H, 0, cn*sizeof(H[0]) );
01947             memset( luc, 0, cn*sizeof(luc[0]) );
01948             for( c = 0; c < cn; c++ )
01949             {
01950                 // Update column histograms for the entire row.
01951                 for( j = 0; j < n; j++ )
01952                 {
01953                     COP( c, j, p0[j*cn + c], -- );
01954                     COP( c, j, p1[j*cn + c], ++ );
01955                 }
01956 
01957                 // First column initialization
01958                 for( k = 0; k < 16; ++k )
01959                     histogram_muladd( 2*r+1, &h_fine[16*n*(16*c+k)], &H[c].fine[k][0] );
01960 
01961             #if MEDIAN_HAVE_SIMD
01962                 if( useSIMD )
01963                 {
01964                     for( j = 0; j < 2*r; ++j )
01965                         histogram_add_simd( &h_coarse[16*(n*c+j)], H[c].coarse );
01966 
01967                     for( j = r; j < n-r; j++ )
01968                     {
01969                         int t = 2*r*r + 2*r, b, sum = 0;
01970                         HT* segment;
01971 
01972                         histogram_add_simd( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
01973 
01974                         // Find median at coarse level
01975                         for ( k = 0; k < 16 ; ++k )
01976                         {
01977                             sum += H[c].coarse[k];
01978                             if ( sum > t )
01979                             {
01980                                 sum -= H[c].coarse[k];
01981                                 break;
01982                             }
01983                         }
01984                         assert( k < 16 );
01985 
01986                         /* Update corresponding histogram segment */
01987                         if ( luc[c][k] <= j-r )
01988                         {
01989                             memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
01990                             for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
01991                                 histogram_add_simd( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
01992 
01993                             if ( luc[c][k] < j+r+1 )
01994                             {
01995                                 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
01996                                 luc[c][k] = (HT)(j+r+1);
01997                             }
01998                         }
01999                         else
02000                         {
02001                             for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
02002                             {
02003                                 histogram_sub_simd( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
02004                                 histogram_add_simd( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
02005                             }
02006                         }
02007 
02008                         histogram_sub_simd( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
02009 
02010                         /* Find median in segment */
02011                         segment = H[c].fine[k];
02012                         for ( b = 0; b < 16 ; b++ )
02013                         {
02014                             sum += segment[b];
02015                             if ( sum > t )
02016                             {
02017                                 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
02018                                 break;
02019                             }
02020                         }
02021                         assert( b < 16 );
02022                     }
02023                 }
02024                 else
02025             #endif
02026                 {
02027                     for( j = 0; j < 2*r; ++j )
02028                         histogram_add( &h_coarse[16*(n*c+j)], H[c].coarse );
02029 
02030                     for( j = r; j < n-r; j++ )
02031                     {
02032                         int t = 2*r*r + 2*r, b, sum = 0;
02033                         HT* segment;
02034 
02035                         histogram_add( &h_coarse[16*(n*c + std::min(j+r,n-1))], H[c].coarse );
02036 
02037                         // Find median at coarse level
02038                         for ( k = 0; k < 16 ; ++k )
02039                         {
02040                             sum += H[c].coarse[k];
02041                             if ( sum > t )
02042                             {
02043                                 sum -= H[c].coarse[k];
02044                                 break;
02045                             }
02046                         }
02047                         assert( k < 16 );
02048 
02049                         /* Update corresponding histogram segment */
02050                         if ( luc[c][k] <= j-r )
02051                         {
02052                             memset( &H[c].fine[k], 0, 16 * sizeof(HT) );
02053                             for ( luc[c][k] = cv::HT(j-r); luc[c][k] < MIN(j+r+1,n); ++luc[c][k] )
02054                                 histogram_add( &h_fine[16*(n*(16*c+k)+luc[c][k])], H[c].fine[k] );
02055 
02056                             if ( luc[c][k] < j+r+1 )
02057                             {
02058                                 histogram_muladd( j+r+1 - n, &h_fine[16*(n*(16*c+k)+(n-1))], &H[c].fine[k][0] );
02059                                 luc[c][k] = (HT)(j+r+1);
02060                             }
02061                         }
02062                         else
02063                         {
02064                             for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
02065                             {
02066                                 histogram_sub( &h_fine[16*(n*(16*c+k)+MAX(luc[c][k]-2*r-1,0))], H[c].fine[k] );
02067                                 histogram_add( &h_fine[16*(n*(16*c+k)+MIN(luc[c][k],n-1))], H[c].fine[k] );
02068                             }
02069                         }
02070 
02071                         histogram_sub( &h_coarse[16*(n*c+MAX(j-r,0))], H[c].coarse );
02072 
02073                         /* Find median in segment */
02074                         segment = H[c].fine[k];
02075                         for ( b = 0; b < 16 ; b++ )
02076                         {
02077                             sum += segment[b];
02078                             if ( sum > t )
02079                             {
02080                                 dst[dstep*i+cn*j+c] = (uchar)(16*k + b);
02081                                 break;
02082                             }
02083                         }
02084                         assert( b < 16 );
02085                     }
02086                 }
02087             }
02088         }
02089     }
02090 
02091 #undef HOP
02092 #undef COP
02093 }
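// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// In medianBlur_8u_O1() above, the median is the first value whose cumulative
// histogram count exceeds t = 2*r*r + 2*r, i.e. ((2r+1)^2 - 1) / 2.  For
// ksize = 5 that is r = 2, a 25-pixel window, and t = 12, so the median is the
// 13th smallest pixel.  exampleMedianRank is a hypothetical helper restating
// that arithmetic.
static inline int exampleMedianRank(int ksize)
{
    int r = (ksize - 1) / 2;
    return 2*r*r + 2*r;              // == ((2*r + 1)*(2*r + 1) - 1) / 2
}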
02094 
02095 static void
02096 medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m )
02097 {
02098     #define N  16
02099     int     zone0[4][N];
02100     int     zone1[4][N*N];
02101     int     x, y;
02102     int     n2 = m*m/2;
02103     Size    size = _dst.size();
02104     const uchar* src = _src.ptr();
02105     uchar*  dst = _dst.ptr();
02106     int     src_step = (int)_src.step, dst_step = (int)_dst.step;
02107     int     cn = _src.channels();
02108     const uchar*  src_max = src + size.height*src_step;
02109 
02110     #define UPDATE_ACC01( pix, cn, op ) \
02111     {                                   \
02112         int p = (pix);                  \
02113         zone1[cn][p] op;                \
02114         zone0[cn][p >> 4] op;           \
02115     }
02116 
02117     //CV_Assert( size.height >= nx && size.width >= nx );
02118     for( x = 0; x < size.width; x++, src += cn, dst += cn )
02119     {
02120         uchar* dst_cur = dst;
02121         const uchar* src_top = src;
02122         const uchar* src_bottom = src;
02123         int k, c;
02124         int src_step1 = src_step, dst_step1 = dst_step;
02125 
02126         if( x % 2 != 0 )
02127         {
02128             src_bottom = src_top += src_step*(size.height-1);
02129             dst_cur += dst_step*(size.height-1);
02130             src_step1 = -src_step1;
02131             dst_step1 = -dst_step1;
02132         }
02133 
02134         // init accumulator
02135         memset( zone0, 0, sizeof(zone0[0])*cn );
02136         memset( zone1, 0, sizeof(zone1[0])*cn );
02137 
02138         for( y = 0; y <= m/2; y++ )
02139         {
02140             for( c = 0; c < cn; c++ )
02141             {
02142                 if( y > 0 )
02143                 {
02144                     for( k = 0; k < m*cn; k += cn )
02145                         UPDATE_ACC01( src_bottom[k+c], c, ++ );
02146                 }
02147                 else
02148                 {
02149                     for( k = 0; k < m*cn; k += cn )
02150                         UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 );
02151                 }
02152             }
02153 
02154             if( (src_step1 > 0 && y < size.height-1) ||
02155                 (src_step1 < 0 && size.height-y-1 > 0) )
02156                 src_bottom += src_step1;
02157         }
02158 
02159         for( y = 0; y < size.height; y++, dst_cur += dst_step1 )
02160         {
02161             // find median
02162             for( c = 0; c < cn; c++ )
02163             {
02164                 int s = 0;
02165                 for( k = 0; ; k++ )
02166                 {
02167                     int t = s + zone0[c][k];
02168                     if( t > n2 ) break;
02169                     s = t;
02170                 }
02171 
02172                 for( k *= N; ;k++ )
02173                 {
02174                     s += zone1[c][k];
02175                     if( s > n2 ) break;
02176                 }
02177 
02178                 dst_cur[c] = (uchar)k;
02179             }
02180 
02181             if( y+1 == size.height )
02182                 break;
02183 
02184             if( cn == 1 )
02185             {
02186                 for( k = 0; k < m; k++ )
02187                 {
02188                     int p = src_top[k];
02189                     int q = src_bottom[k];
02190                     zone1[0][p]--;
02191                     zone0[0][p>>4]--;
02192                     zone1[0][q]++;
02193                     zone0[0][q>>4]++;
02194                 }
02195             }
02196             else if( cn == 3 )
02197             {
02198                 for( k = 0; k < m*3; k += 3 )
02199                 {
02200                     UPDATE_ACC01( src_top[k], 0, -- );
02201                     UPDATE_ACC01( src_top[k+1], 1, -- );
02202                     UPDATE_ACC01( src_top[k+2], 2, -- );
02203 
02204                     UPDATE_ACC01( src_bottom[k], 0, ++ );
02205                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
02206                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
02207                 }
02208             }
02209             else
02210             {
02211                 assert( cn == 4 );
02212                 for( k = 0; k < m*4; k += 4 )
02213                 {
02214                     UPDATE_ACC01( src_top[k], 0, -- );
02215                     UPDATE_ACC01( src_top[k+1], 1, -- );
02216                     UPDATE_ACC01( src_top[k+2], 2, -- );
02217                     UPDATE_ACC01( src_top[k+3], 3, -- );
02218 
02219                     UPDATE_ACC01( src_bottom[k], 0, ++ );
02220                     UPDATE_ACC01( src_bottom[k+1], 1, ++ );
02221                     UPDATE_ACC01( src_bottom[k+2], 2, ++ );
02222                     UPDATE_ACC01( src_bottom[k+3], 3, ++ );
02223                 }
02224             }
02225 
02226             if( (src_step1 > 0 && src_bottom + src_step1 < src_max) ||
02227                 (src_step1 < 0 && src_bottom + src_step1 >= src) )
02228                 src_bottom += src_step1;
02229 
02230             if( y >= m/2 )
02231                 src_top += src_step1;
02232         }
02233     }
02234 #undef N
02235 #undef UPDATE_ACC01
02236 }
02237 
02238 
02239 struct MinMax8u
02240 {
02241     typedef uchar value_type;
02242     typedef int arg_type;
02243     enum { SIZE = 1 };
02244     arg_type load(const uchar* ptr) { return *ptr; }
02245     void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; }
02246     void operator()(arg_type& a, arg_type& b) const
02247     {
02248         int t = CV_FAST_CAST_8U(a - b);
02249         b += t; a -= t;
02250     }
02251 };
02252 
02253 struct MinMax16u
02254 {
02255     typedef ushort value_type;
02256     typedef int arg_type;
02257     enum { SIZE = 1 };
02258     arg_type load(const ushort* ptr) { return *ptr; }
02259     void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; }
02260     void operator()(arg_type& a, arg_type& b) const
02261     {
02262         arg_type t = a;
02263         a = std::min(a, b);
02264         b = std::max(b, t);
02265     }
02266 };
02267 
02268 struct MinMax16s
02269 {
02270     typedef short value_type;
02271     typedef int arg_type;
02272     enum { SIZE = 1 };
02273     arg_type load(const short* ptr) { return *ptr; }
02274     void store(short* ptr, arg_type val) { *ptr = (short)val; }
02275     void operator()(arg_type& a, arg_type& b) const
02276     {
02277         arg_type t = a;
02278         a = std::min(a, b);
02279         b = std::max(b, t);
02280     }
02281 };
02282 
02283 struct MinMax32f
02284 {
02285     typedef float value_type;
02286     typedef float arg_type;
02287     enum { SIZE = 1 };
02288     arg_type load(const float* ptr) { return *ptr; }
02289     void store(float* ptr, arg_type val) { *ptr = val; }
02290     void operator()(arg_type& a, arg_type& b) const
02291     {
02292         arg_type t = a;
02293         a = std::min(a, b);
02294         b = std::max(b, t);
02295     }
02296 };
02297 
02298 #if CV_SSE2
02299 
02300 struct MinMaxVec8u
02301 {
02302     typedef uchar value_type;
02303     typedef __m128i arg_type;
02304     enum { SIZE = 16 };
02305     arg_type load(const uchar* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02306     void store(uchar* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02307     void operator()(arg_type& a, arg_type& b) const
02308     {
02309         arg_type t = a;
02310         a = _mm_min_epu8(a, b);
02311         b = _mm_max_epu8(b, t);
02312     }
02313 };
02314 
02315 
02316 struct MinMaxVec16u
02317 {
02318     typedef ushort value_type;
02319     typedef __m128i arg_type;
02320     enum { SIZE = 8 };
02321     arg_type load(const ushort* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02322     void store(ushort* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02323     void operator()(arg_type& a, arg_type& b) const
02324     {
02325         arg_type t = _mm_subs_epu16(a, b);
02326         a = _mm_subs_epu16(a, t);
02327         b = _mm_adds_epu16(b, t);
02328     }
02329 };
02330 
02331 
02332 struct MinMaxVec16s
02333 {
02334     typedef short value_type;
02335     typedef __m128i arg_type;
02336     enum { SIZE = 8 };
02337     arg_type load(const short* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
02338     void store(short* ptr, arg_type val) { _mm_storeu_si128((__m128i*)ptr, val); }
02339     void operator()(arg_type& a, arg_type& b) const
02340     {
02341         arg_type t = a;
02342         a = _mm_min_epi16(a, b);
02343         b = _mm_max_epi16(b, t);
02344     }
02345 };
02346 
02347 
02348 struct MinMaxVec32f
02349 {
02350     typedef float value_type;
02351     typedef __m128 arg_type;
02352     enum { SIZE = 4 };
02353     arg_type load(const float* ptr) { return _mm_loadu_ps(ptr); }
02354     void store(float* ptr, arg_type val) { _mm_storeu_ps(ptr, val); }
02355     void operator()(arg_type& a, arg_type& b) const
02356     {
02357         arg_type t = a;
02358         a = _mm_min_ps(a, b);
02359         b = _mm_max_ps(b, t);
02360     }
02361 };
02362 
02363 #elif CV_NEON
02364 
02365 struct MinMaxVec8u
02366 {
02367     typedef uchar value_type;
02368     typedef uint8x16_t arg_type;
02369     enum { SIZE = 16 };
02370     arg_type load(const uchar* ptr) { return vld1q_u8(ptr); }
02371     void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); }
02372     void operator()(arg_type& a, arg_type& b) const
02373     {
02374         arg_type t = a;
02375         a = vminq_u8(a, b);
02376         b = vmaxq_u8(b, t);
02377     }
02378 };
02379 
02380 
02381 struct MinMaxVec16u
02382 {
02383     typedef ushort value_type;
02384     typedef uint16x8_t arg_type;
02385     enum { SIZE = 8 };
02386     arg_type load(const ushort* ptr) { return vld1q_u16(ptr); }
02387     void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); }
02388     void operator()(arg_type& a, arg_type& b) const
02389     {
02390         arg_type t = a;
02391         a = vminq_u16(a, b);
02392         b = vmaxq_u16(b, t);
02393     }
02394 };
02395 
02396 
02397 struct MinMaxVec16s
02398 {
02399     typedef short value_type;
02400     typedef int16x8_t arg_type;
02401     enum { SIZE = 8 };
02402     arg_type load(const short* ptr) { return vld1q_s16(ptr); }
02403     void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); }
02404     void operator()(arg_type& a, arg_type& b) const
02405     {
02406         arg_type t = a;
02407         a = vminq_s16(a, b);
02408         b = vmaxq_s16(b, t);
02409     }
02410 };
02411 
02412 
02413 struct MinMaxVec32f
02414 {
02415     typedef float value_type;
02416     typedef float32x4_t arg_type;
02417     enum { SIZE = 4 };
02418     arg_type load(const float* ptr) { return vld1q_f32(ptr); }
02419     void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); }
02420     void operator()(arg_type& a, arg_type& b) const
02421     {
02422         arg_type t = a;
02423         a = vminq_f32(a, b);
02424         b = vmaxq_f32(b, t);
02425     }
02426 };
02427 
02428 
02429 #else
02430 
02431 typedef MinMax8u MinMaxVec8u;
02432 typedef MinMax16u MinMaxVec16u;
02433 typedef MinMax16s MinMaxVec16s;
02434 typedef MinMax32f MinMaxVec32f;
02435 
02436 #endif
02437 
02438 template<class Op, class VecOp>
02439 static void
02440 medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
02441 {
02442     typedef typename Op::value_type T;
02443     typedef typename Op::arg_type WT;
02444     typedef typename VecOp::arg_type VT;
02445 
02446     const T* src = _src.ptr<T>();
02447     T* dst = _dst.ptr<T>();
02448     int sstep = (int)(_src.step/sizeof(T));
02449     int dstep = (int)(_dst.step/sizeof(T));
02450     Size size = _dst.size();
02451     int i, j, k, cn = _src.channels();
02452     Op op;
02453     VecOp vop;
02454     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
02455 
02456     if( m == 3 )
02457     {
02458         if( size.width == 1 || size.height == 1 )
02459         {
02460             int len = size.width + size.height - 1;
02461             int sdelta = size.height == 1 ? cn : sstep;
02462             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
02463             int ddelta = size.height == 1 ? cn : dstep;
02464 
02465             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
02466                 for( j = 0; j < cn; j++, src++ )
02467                 {
02468                     WT p0 = src[i > 0 ? -sdelta : 0];
02469                     WT p1 = src[0];
02470                     WT p2 = src[i < len - 1 ? sdelta : 0];
02471 
02472                     op(p0, p1); op(p1, p2); op(p0, p1);
02473                     dst[j] = (T)p1;
02474                 }
02475             return;
02476         }
02477 
02478         size.width *= cn;
02479         for( i = 0; i < size.height; i++, dst += dstep )
02480         {
02481             const T* row0 = src + std::max(i - 1, 0)*sstep;
02482             const T* row1 = src + i*sstep;
02483             const T* row2 = src + std::min(i + 1, size.height-1)*sstep;
02484             int limit = useSIMD ? cn : size.width;
02485 
02486             for(j = 0;; )
02487             {
02488                 for( ; j < limit; j++ )
02489                 {
02490                     int j0 = j >= cn ? j - cn : j;
02491                     int j2 = j < size.width - cn ? j + cn : j;
02492                     WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2];
02493                     WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2];
02494                     WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2];
02495 
02496                     op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
02497                     op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
02498                     op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
02499                     op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
02500                     op(p4, p2); op(p6, p4); op(p4, p2);
02501                     dst[j] = (T)p4;
02502                 }
02503 
02504                 if( limit == size.width )
02505                     break;
02506 
02507                 for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE )
02508                 {
02509                     VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn);
02510                     VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn);
02511                     VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn);
02512 
02513                     vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
02514                     vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
02515                     vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
02516                     vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
02517                     vop(p4, p2); vop(p6, p4); vop(p4, p2);
02518                     vop.store(dst+j, p4);
02519                 }
02520 
02521                 limit = size.width;
02522             }
02523         }
02524     }
02525     else if( m == 5 )
02526     {
02527         if( size.width == 1 || size.height == 1 )
02528         {
02529             int len = size.width + size.height - 1;
02530             int sdelta = size.height == 1 ? cn : sstep;
02531             int sdelta0 = size.height == 1 ? 0 : sstep - cn;
02532             int ddelta = size.height == 1 ? cn : dstep;
02533 
02534             for( i = 0; i < len; i++, src += sdelta0, dst += ddelta )
02535                 for( j = 0; j < cn; j++, src++ )
02536                 {
02537                     int i1 = i > 0 ? -sdelta : 0;
02538                     int i0 = i > 1 ? -sdelta*2 : i1;
02539                     int i3 = i < len-1 ? sdelta : 0;
02540                     int i4 = i < len-2 ? sdelta*2 : i3;
02541                     WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4];
02542 
02543                     op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2);
02544                     op(p2, p4); op(p1, p3); op(p1, p2);
02545                     dst[j] = (T)p2;
02546                 }
02547             return;
02548         }
02549 
02550         size.width *= cn;
02551         for( i = 0; i < size.height; i++, dst += dstep )
02552         {
02553             const T* row[5];
02554             row[0] = src + std::max(i - 2, 0)*sstep;
02555             row[1] = src + std::max(i - 1, 0)*sstep;
02556             row[2] = src + i*sstep;
02557             row[3] = src + std::min(i + 1, size.height-1)*sstep;
02558             row[4] = src + std::min(i + 2, size.height-1)*sstep;
02559             int limit = useSIMD ? cn*2 : size.width;
02560 
02561             for(j = 0;; )
02562             {
02563                 for( ; j < limit; j++ )
02564                 {
02565                     WT p[25];
02566                     int j1 = j >= cn ? j - cn : j;
02567                     int j0 = j >= cn*2 ? j - cn*2 : j1;
02568                     int j3 = j < size.width - cn ? j + cn : j;
02569                     int j4 = j < size.width - cn*2 ? j + cn*2 : j3;
02570                     for( k = 0; k < 5; k++ )
02571                     {
02572                         const T* rowk = row[k];
02573                         p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1];
02574                         p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3];
02575                         p[k*5+4] = rowk[j4];
02576                     }
02577 
02578                     op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]);
02579                     op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]);
02580                     op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]);
02581                     op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]);
02582                     op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]);
02583                     op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]);
02584                     op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]);
02585                     op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]);
02586                     op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]);
02587                     op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]);
02588                     op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]);
02589                     op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]);
02590                     op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]);
02591                     op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]);
02592                     op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]);
02593                     op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]);
02594                     op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]);
02595                     op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]);
02596                     op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]);
02597                     op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]);
02598                     op(p[10], p[12]); op(p[1], p[13]); op(p[9], p[21]); op(p[9], p[13]); op(p[5], p[17]);
02599                     op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]);
02600                     op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]);
02601                     dst[j] = (T)p[12];
02602                 }
02603 
02604                 if( limit == size.width )
02605                     break;
02606 
02607                 for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE )
02608                 {
02609                     VT p[25];
02610                     for( k = 0; k < 5; k++ )
02611                     {
02612                         const T* rowk = row[k];
02613                         p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn);
02614                         p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn);
02615                         p[k*5+4] = vop.load(rowk+j+cn*2);
02616                     }
02617 
02618                     vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]);
02619                     vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]);
02620                     vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]);
02621                     vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]);
02622                     vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]);
02623                     vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]);
02624                     vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]);
02625                     vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]);
02626                     vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]);
02627                     vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]);
02628                     vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]);
02629                     vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]);
02630                     vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]);
02631                     vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]);
02632                     vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]);
02633                     vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]);
02634                     vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]);
02635                     vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]);
02636                     vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]);
02637                     vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]);
02638                     vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]);
02639                     vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]);
02640                     vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]);
02641                     vop.store(dst+j, p[12]);
02642                 }
02643 
02644                 limit = size.width;
02645             }
02646         }
02647     }
02648 }
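// --- Editor's note: illustrative sketch, not part of the original smooth.cpp ---
// medianBlur_SortNet() above evaluates a fixed sorting network: each op(a, b)
// is a branch-free compare-exchange that leaves the minimum in a and the
// maximum in b, so the same operation sequence works per scalar and per SIMD
// lane.  The smallest case, a median of three, needs only the three exchanges
// used in the 1-D branch above; exampleMedianOf3 is a hypothetical helper.
static inline int exampleMedianOf3(int p0, int p1, int p2)
{
    int t;
    if (p0 > p1) { t = p0; p0 = p1; p1 = t; }   // op(p0, p1)
    if (p1 > p2) { t = p1; p1 = p2; p2 = t; }   // op(p1, p2)
    if (p0 > p1) { t = p0; p0 = p1; p1 = t; }   // op(p0, p1)
    return p1;                                   // p1 now holds the median
}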
02649 
02650 #ifdef HAVE_OPENCL
02651 
02652 static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m)
02653 {
02654     size_t localsize[2] = { 16, 16 };
02655     size_t globalsize[2];
02656     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
02657 
02658     if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) )
02659         return false;
02660 
02661     Size imgSize = _src.size();
02662     bool useOptimized = (1 == cn) &&
02663                         (size_t)imgSize.width >= localsize[0] * 8  &&
02664                         (size_t)imgSize.height >= localsize[1] * 8 &&
02665                         imgSize.width % 4 == 0 &&
02666                         imgSize.height % 4 == 0 &&
02667                         (ocl::Device::getDefault().isIntel());
02668 
02669     cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m);
02670     cv::String kdefs = useOptimized ?
02671                          format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type),
02672                          ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn)
02673                          :
02674                          format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn);
02675 
02676     ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() );
02677 
02678     if (k.empty())
02679         return false;
02680 
02681     UMat src = _src.getUMat();
02682     _dst.create(src.size(), type);
02683     UMat dst = _dst.getUMat();
02684 
02685     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
02686 
02687     if( useOptimized )
02688     {
02689         globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0];
02690         globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1];
02691     }
02692     else
02693     {
02694         globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0];
02695         globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1];
02696     }
02697 
02698     return k.run(2, globalsize, localsize, false);
02699 }
02700 
02701 #endif
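
In the optimized path above, each work-item covers a 4x4 block, so the global size is the block count rounded up to a whole number of work-groups (DIVUP, defined elsewhere in the library, is used here as integer ceiling division). A minimal standalone sketch of that rounding, with a hypothetical helper name:

    // Sketch only: round 'items' up to the next multiple of the work-group size 'wg'.
    // Same effect as DIVUP(items, wg) * wg in ocl_medianFilter above.
    static inline size_t roundUpToGroup(size_t items, size_t wg)
    {
        return (items + wg - 1) / wg * wg;
    }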
02702 
02703 }
02704 
02705 #ifdef HAVE_IPP
02706 namespace cv
02707 {
02708 static bool ipp_medianFilter( InputArray _src0, OutputArray _dst, int ksize )
02709 {
02710 #if IPP_VERSION_X100 >= 810
02711     Mat src0 = _src0.getMat();
02712     _dst.create( src0.size(), src0.type() );
02713     Mat dst = _dst.getMat();
02714 
02715 #define IPP_FILTER_MEDIAN_BORDER(ippType, ippDataType, flavor) \
02716     do \
02717     { \
02718         if (ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, \
02719         ippDataType, CV_MAT_CN(type), &bufSize) >= 0) \
02720         { \
02721             Ipp8u * buffer = ippsMalloc_8u(bufSize); \
02722             IppStatus status = ippiFilterMedianBorder_##flavor(src.ptr<ippType>(), (int)src.step, \
02723             dst.ptr<ippType>(), (int)dst.step, dstRoiSize, maskSize, \
02724             ippBorderRepl, (ippType)0, buffer); \
02725             ippsFree(buffer); \
02726             if (status >= 0) \
02727             { \
02728                 CV_IMPL_ADD(CV_IMPL_IPP); \
02729                 return true; \
02730             } \
02731         } \
02732     } \
02733     while ((void)0, 0)
02734 
02735     if( ksize <= 5 )
02736     {
02737         Ipp32s bufSize;
02738         IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize);
02739         Mat src;
02740         if( dst.data != src0.data )
02741             src = src0;
02742         else
02743             src0.copyTo(src);
02744 
02745         int type = src0.type();
02746         if (type == CV_8UC1)
02747             IPP_FILTER_MEDIAN_BORDER(Ipp8u, ipp8u, 8u_C1R);
02748         else if (type == CV_16UC1)
02749             IPP_FILTER_MEDIAN_BORDER(Ipp16u, ipp16u, 16u_C1R);
02750         else if (type == CV_16SC1)
02751             IPP_FILTER_MEDIAN_BORDER(Ipp16s, ipp16s, 16s_C1R);
02752         else if (type == CV_32FC1)
02753             IPP_FILTER_MEDIAN_BORDER(Ipp32f, ipp32f, 32f_C1R);
02754     }
02755 #undef IPP_FILTER_MEDIAN_BORDER
02756 #else
02757     CV_UNUSED(_src0); CV_UNUSED(_dst); CV_UNUSED(ksize);
02758 #endif
02759     return false;
02760 }
02761 }
02762 #endif
02763 
02764 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
02765 {
02766     CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
02767 
02768     if( ksize <= 1 )
02769     {
02770         _src0.copyTo(_dst);
02771         return;
02772     }
02773 
02774 #ifdef HAVE_OPENCL
02775     CV_OCL_RUN(_dst.isUMat(),
02776                ocl_medianFilter(_src0,_dst, ksize))
02777 #endif
02778 
02779     Mat src0 = _src0.getMat();
02780     _dst.create( src0.size(), src0.type() );
02781     Mat dst = _dst.getMat();
02782 
02783     CV_IPP_RUN(IPP_VERSION_X100 >= 810 && ksize <= 5, ipp_medianFilter(_src0,_dst, ksize));
02784 
02785 #ifdef HAVE_TEGRA_OPTIMIZATION
02786     if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
02787         return;
02788 #endif
02789 
02790     bool useSortNet = ksize == 3 || (ksize == 5
02791 #if !(CV_SSE2 || CV_NEON)
02792             && src0.depth() > CV_8U
02793 #endif
02794         );
02795 
02796     Mat src;
02797     if( useSortNet )
02798     {
02799         if( dst.data != src0.data )
02800             src = src0;
02801         else
02802             src0.copyTo(src);
02803 
02804         if( src.depth() == CV_8U )
02805             medianBlur_SortNet<MinMax8u, MinMaxVec8u>( src, dst, ksize );
02806         else if( src.depth() == CV_16U )
02807             medianBlur_SortNet<MinMax16u, MinMaxVec16u>( src, dst, ksize );
02808         else if( src.depth() == CV_16S )
02809             medianBlur_SortNet<MinMax16s, MinMaxVec16s>( src, dst, ksize );
02810         else if( src.depth() == CV_32F )
02811             medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
02812         else
02813             CV_Error(CV_StsUnsupportedFormat, "");
02814 
02815         return;
02816     }
02817     else
02818     {
02819         cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE );
02820 
02821         int cn = src0.channels();
02822         CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) );
02823 
02824         double img_size_mp = (double)(src0.total())/(1 << 20);
02825         if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*
02826             (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3))
02827             medianBlur_8u_Om( src, dst, ksize );
02828         else
02829             medianBlur_8u_O1 ( src, dst, ksize );
02830     }
02831 }
02832 
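A minimal usage sketch for the median filter entry point above (not part of smooth.cpp; assumes an 8-bit image loaded from a hypothetical file name):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        cv::Mat src = cv::imread("input.png");   // hypothetical input file
        if (src.empty()) return 1;
        cv::Mat dst;
        cv::medianBlur(src, dst, 5);             // ksize must be odd and greater than 1
        return 0;
    }
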
02833 /****************************************************************************************\
02834                                    Bilateral Filtering
02835 \****************************************************************************************/
02836 
02837 namespace cv
02838 {
02839 
02840 class BilateralFilter_8u_Invoker :
02841     public ParallelLoopBody
02842 {
02843 public:
02844     BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
02845         int* _space_ofs, float *_space_weight, float *_color_weight) :
02846         temp(&_temp), dest(&_dest), radius(_radius),
02847         maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
02848     {
02849     }
02850 
02851     virtual void operator() (const Range& range) const
02852     {
02853         int i, j, cn = dest->channels(), k;
02854         Size size = dest->size();
02855         #if CV_SSE3
02856         int CV_DECL_ALIGNED(16) buf[4];
02857         float CV_DECL_ALIGNED(16) bufSum[4];
02858         static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
02859         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
02860         #endif
02861 
02862         for( i = range.start; i < range.end; i++ )
02863         {
02864             const uchar* sptr = temp->ptr(i+radius) + radius*cn;
02865             uchar* dptr = dest->ptr(i);
02866 
02867             if( cn == 1 )
02868             {
02869                 for( j = 0; j < size.width; j++ )
02870                 {
02871                     float sum = 0, wsum = 0;
02872                     int val0 = sptr[j];
02873                     k = 0;
02874                     #if CV_SSE3
02875                     if( haveSSE3 )
02876                     {
02877                         __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
02878                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
02879 
02880                         for( ; k <= maxk - 4; k += 4 )
02881                         {
02882                             __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
02883                                                       sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
02884 
02885                             __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
02886                             _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
02887 
02888                             __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
02889                                                     color_weight[buf[1]],color_weight[buf[0]]);
02890                             __m128 _sw = _mm_loadu_ps(space_weight+k);
02891                             __m128 _w = _mm_mul_ps(_cw, _sw);
02892                              _cw = _mm_mul_ps(_w, _valF);
02893 
02894                              _sw = _mm_hadd_ps(_w, _cw);
02895                              _sw = _mm_hadd_ps(_sw, _sw);
02896                              _mm_storel_pi((__m64*)bufSum, _sw);
02897 
02898                              sum += bufSum[1];
02899                              wsum += bufSum[0];
02900                         }
02901                     }
02902                     #endif
02903                     for( ; k < maxk; k++ )
02904                     {
02905                         int val = sptr[j + space_ofs[k]];
02906                         float w = space_weight[k]*color_weight[std::abs(val - val0)];
02907                         sum += val*w;
02908                         wsum += w;
02909                     }
02910                     // overflow is not possible here => there is no need to use cv::saturate_cast
02911                     dptr[j] = (uchar)cvRound(sum/wsum);
02912                 }
02913             }
02914             else
02915             {
02916                 assert( cn == 3 );
02917                 for( j = 0; j < size.width*3; j += 3 )
02918                 {
02919                     float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
02920                     int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
02921                     k = 0;
02922                     #if CV_SSE3
02923                     if( haveSSE3 )
02924                     {
02925                         const __m128i izero = _mm_setzero_si128();
02926                         const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
02927                         const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
02928                         const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
02929                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
02930 
02931                         for( ; k <= maxk - 4; k += 4 )
02932                         {
02933                             const int* const sptr_k0  = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
02934                             const int* const sptr_k1  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
02935                             const int* const sptr_k2  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
02936                             const int* const sptr_k3  = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
02937 
02938                             __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
02939                             __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
02940                             __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
02941                             __m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
02942 
02943                             _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
02944 
02945                             __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
02946                             __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
02947                             __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
02948 
02949                             bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
02950                             _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
02951 
02952                             __m128 _w  = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
02953                                                     color_weight[buf[1]],color_weight[buf[0]]);
02954                             __m128 _sw = _mm_loadu_ps(space_weight+k);
02955 
02956                             _w = _mm_mul_ps(_w,_sw);
02957                             _b = _mm_mul_ps(_b, _w);
02958                             _g = _mm_mul_ps(_g, _w);
02959                             _r = _mm_mul_ps(_r, _w);
02960 
02961                              _w = _mm_hadd_ps(_w, _b);
02962                              _g = _mm_hadd_ps(_g, _r);
02963 
02964                              _w = _mm_hadd_ps(_w, _g);
02965                              _mm_store_ps(bufSum, _w);
02966 
02967                              wsum  += bufSum[0];
02968                              sum_b += bufSum[1];
02969                              sum_g += bufSum[2];
02970                              sum_r += bufSum[3];
02971                          }
02972                     }
02973                     #endif
02974 
02975                     for( ; k < maxk; k++ )
02976                     {
02977                         const uchar* sptr_k = sptr + j + space_ofs[k];
02978                         int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
02979                         float w = space_weight[k]*color_weight[std::abs(b - b0) +
02980                                                                std::abs(g - g0) + std::abs(r - r0)];
02981                         sum_b += b*w; sum_g += g*w; sum_r += r*w;
02982                         wsum += w;
02983                     }
02984                     wsum = 1.f/wsum;
02985                     b0 = cvRound(sum_b*wsum);
02986                     g0 = cvRound(sum_g*wsum);
02987                     r0 = cvRound(sum_r*wsum);
02988                     dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
02989                 }
02990             }
02991         }
02992     }
02993 
02994 private:
02995     const Mat *temp;
02996     Mat *dest;
02997     int radius, maxk, *space_ofs;
02998     float *space_weight, *color_weight;
02999 };
03000 
03001 #if defined (HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) && IPP_DISABLE_BLOCK
03002 class IPPBilateralFilter_8u_Invoker :
03003     public ParallelLoopBody
03004 {
03005 public:
03006     IPPBilateralFilter_8u_Invoker(Mat &_src, Mat &_dst, double _sigma_color, double _sigma_space, int _radius, bool *_ok) :
03007       ParallelLoopBody(), src(_src), dst(_dst), sigma_color(_sigma_color), sigma_space(_sigma_space), radius(_radius), ok(_ok)
03008       {
03009           *ok = true;
03010       }
03011 
03012       virtual void operator() (const Range& range) const
03013       {
03014           int d = radius * 2 + 1;
03015           IppiSize kernel = {d, d};
03016           IppiSize roi={dst.cols, range.end - range.start};
03017           int bufsize=0;
03018           if (0 > ippiFilterBilateralGetBufSize_8u_C1R( ippiFilterBilateralGauss, roi, kernel, &bufsize))
03019           {
03020               *ok = false;
03021               return;
03022           }
03023           AutoBuffer<uchar> buf(bufsize);
03024           IppiFilterBilateralSpec *pSpec = (IppiFilterBilateralSpec *)alignPtr(&buf[0], 32);
03025           if (0 > ippiFilterBilateralInit_8u_C1R( ippiFilterBilateralGauss, kernel, (Ipp32f)sigma_color, (Ipp32f)sigma_space, 1, pSpec ))
03026           {
03027               *ok = false;
03028               return;
03029           }
03030           if (0 > ippiFilterBilateral_8u_C1R( src.ptr<uchar>(range.start) + radius * ((int)src.step[0] + 1), (int)src.step[0], dst.ptr<uchar>(range.start), (int)dst.step[0], roi, kernel, pSpec ))
03031               *ok = false;
03032           else
03033           {
03034             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
03035           }
03036       }
03037 private:
03038     Mat &src;
03039     Mat &dst;
03040     double sigma_color;
03041     double sigma_space;
03042     int radius;
03043     bool *ok;
03044     const IPPBilateralFilter_8u_Invoker& operator= (const IPPBilateralFilter_8u_Invoker&);
03045 };
03046 #endif
03047 
03048 #ifdef HAVE_OPENCL
03049 
03050 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
03051                                    double sigma_color, double sigma_space,
03052                                    int borderType)
03053 {
03054 #ifdef ANDROID
03055     if (ocl::Device::getDefault().isNVidia())
03056         return false;
03057 #endif
03058 
03059     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
03060     int i, j, maxk, radius;
03061 
03062     if (depth != CV_8U || cn > 4)
03063         return false;
03064 
03065     if (sigma_color <= 0)
03066         sigma_color = 1;
03067     if (sigma_space <= 0)
03068         sigma_space = 1;
03069 
03070     double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
03071     double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
03072 
03073     if ( d <= 0 )
03074         radius = cvRound(sigma_space * 1.5);
03075     else
03076         radius = d / 2;
03077     radius = MAX(radius, 1);
03078     d = radius * 2 + 1;
03079 
03080     UMat src = _src.getUMat(), dst = _dst.getUMat(), temp;
03081     if (src.u == dst.u)
03082         return false;
03083 
03084     copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);
03085     std::vector<float> _space_weight(d * d);
03086     std::vector<int> _space_ofs(d * d);
03087     float * const space_weight = &_space_weight[0];
03088     int * const space_ofs = &_space_ofs[0];
03089 
03090     // initialize space-related bilateral filter coefficients
03091     for( i = -radius, maxk = 0; i <= radius; i++ )
03092         for( j = -radius; j <= radius; j++ )
03093         {
03094             double r = std::sqrt((double)i * i + (double)j * j);
03095             if ( r > radius )
03096                 continue;
03097             space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
03098             space_ofs[maxk++] = (int)(i * temp.step + j * cn);
03099         }
03100 
03101     char cvt[3][40];
03102     String cnstr = cn > 1 ? format("%d", cn) : "";
03103     String kernelName("bilateral");
03104     size_t sizeDiv = 1;
03105     if ((ocl::Device::getDefault().isIntel()) &&
03106         (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
03107     {
03108             // Intel GPU
03109             if (dst.cols % 4 == 0 && cn == 1) // single-channel images whose width is a multiple of 4
03110             {
03111                 kernelName = "bilateral_float4";
03112                 sizeDiv = 4;
03113             }
03114      }
03115      ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
03116             format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
03117             " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f",
03118             radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
03119             ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
03120             ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
03121             ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
03122             ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff));
03123     if (k.empty())
03124         return false;
03125 
03126     Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
03127     Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
03128     UMat ucolor_weight, uspace_weight, uspace_ofs;
03129 
03130     mspace_weight.copyTo(uspace_weight);
03131     mspace_ofs.copyTo(uspace_ofs);
03132 
03133     k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
03134            ocl::KernelArg::PtrReadOnly(uspace_weight),
03135            ocl::KernelArg::PtrReadOnly(uspace_ofs));
03136 
03137     size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows };
03138     return k.run(2, globalsize, NULL, false);
03139 }
03140 
03141 #endif
03142 static void
03143 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
03144     double sigma_color, double sigma_space,
03145     int borderType )
03146 {
03147     int cn = src.channels();
03148     int i, j, maxk, radius;
03149     Size size = src.size();
03150 
03151     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data );
03152 
03153     if( sigma_color <= 0 )
03154         sigma_color = 1;
03155     if( sigma_space <= 0 )
03156         sigma_space = 1;
03157 
03158     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
03159     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
03160 
03161     if( d <= 0 )
03162         radius = cvRound(sigma_space*1.5);
03163     else
03164         radius = d/2;
03165     radius = MAX(radius, 1);
03166     d = radius*2 + 1;
03167 
03168     Mat temp;
03169     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
03170 
03171 #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) && IPP_DISABLE_BLOCK
03172     CV_IPP_CHECK()
03173     {
03174         if( cn == 1 )
03175         {
03176             bool ok;
03177             IPPBilateralFilter_8u_Invoker body(temp, dst, sigma_color * sigma_color, sigma_space * sigma_space, radius, &ok );
03178             parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16));
03179             if( ok )
03180             {
03181                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
03182                 return;
03183             }
03184             setIppErrorStatus();
03185         }
03186     }
03187 #endif
03188 
03189     std::vector<float> _color_weight(cn*256);
03190     std::vector<float> _space_weight(d*d);
03191     std::vector<int> _space_ofs(d*d);
03192     float* color_weight = &_color_weight[0];
03193     float* space_weight = &_space_weight[0];
03194     int* space_ofs = &_space_ofs[0];
03195 
03196     // initialize color-related bilateral filter coefficients
03197 
03198     for( i = 0; i < 256*cn; i++ )
03199         color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
03200 
03201     // initialize space-related bilateral filter coefficients
03202     for( i = -radius, maxk = 0; i <= radius; i++ )
03203     {
03204         j = -radius;
03205 
03206         for( ; j <= radius; j++ )
03207         {
03208             double r = std::sqrt((double)i*i + (double)j*j);
03209             if( r > radius )
03210                 continue;
03211             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
03212             space_ofs[maxk++] = (int)(i*temp.step + j*cn);
03213         }
03214     }
03215 
03216     BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
03217     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
03218 }
03219 
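For orientation, the weight accumulated for each neighbour above is the product of a spatial Gaussian (precomputed into space_weight) and an intensity Gaussian (looked up in color_weight). A standalone sketch of that product, computed directly instead of through the lookup tables (the helper name is hypothetical):

    #include <cmath>

    // Sketch: bilateral weight of one single-channel neighbour at spatial distance
    // 'dist' whose intensity differs from the centre pixel by 'diff' grey levels.
    static inline float bilateralWeight(int diff, double dist,
                                        double sigma_color, double sigma_space)
    {
        double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
        double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
        return (float)(std::exp(dist * dist * gauss_space_coeff) *
                       std::exp((double)diff * diff * gauss_color_coeff));
    }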
03220 
03221 class BilateralFilter_32f_Invoker :
03222     public ParallelLoopBody
03223 {
03224 public:
03225 
03226     BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
03227         const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
03228         cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
03229         temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
03230     {
03231     }
03232 
03233     virtual void operator() (const Range& range) const
03234     {
03235         int i, j, k;
03236         Size size = dest->size();
03237         #if CV_SSE3
03238         int CV_DECL_ALIGNED(16) idxBuf[4];
03239         float CV_DECL_ALIGNED(16) bufSum32[4];
03240         static const unsigned int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
03241         bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
03242         #endif
03243 
03244         for( i = range.start; i < range.end; i++ )
03245         {
03246             const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
03247             float* dptr = dest->ptr<float>(i);
03248 
03249             if( cn == 1 )
03250             {
03251                 for( j = 0; j < size.width; j++ )
03252                 {
03253                     float sum = 0, wsum = 0;
03254                     float val0 = sptr[j];
03255                     k = 0;
03256                     #if CV_SSE3
03257                     if( haveSSE3 )
03258                     {
03259                         __m128 psum = _mm_setzero_ps();
03260                         const __m128 _val0 = _mm_set1_ps(sptr[j]);
03261                         const __m128 _scale_index = _mm_set1_ps(scale_index);
03262                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
03263 
03264                         for( ; k <= maxk - 4 ; k += 4 )
03265                         {
03266                             __m128 _sw    = _mm_loadu_ps(space_weight + k);
03267                             __m128 _val   = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
03268                                                        sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
03269                             __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
03270 
03271                             __m128i _idx = _mm_cvtps_epi32(_alpha);
03272                             _mm_store_si128((__m128i*)idxBuf, _idx);
03273                             _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
03274 
03275                             __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
03276                                                          expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
03277                             __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
03278                                                          expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
03279 
03280                             __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
03281                             _val = _mm_mul_ps(_w, _val);
03282 
03283                              _sw = _mm_hadd_ps(_w, _val);
03284                              _sw = _mm_hadd_ps(_sw, _sw);
03285                              psum = _mm_add_ps(_sw, psum);
03286                         }
03287                         _mm_storel_pi((__m64*)bufSum32, psum);
03288 
03289                         sum = bufSum32[1];
03290                         wsum = bufSum32[0];
03291                     }
03292                     #endif
03293 
03294                     for( ; k < maxk; k++ )
03295                     {
03296                         float val = sptr[j + space_ofs[k]];
03297                         float alpha = (float)(std::abs(val - val0)*scale_index);
03298                         int idx = cvFloor(alpha);
03299                         alpha -= idx;
03300                         float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
03301                         sum += val*w;
03302                         wsum += w;
03303                     }
03304                     dptr[j] = (float)(sum/wsum);
03305                 }
03306             }
03307             else
03308             {
03309                 CV_Assert( cn == 3 );
03310                 for( j = 0; j < size.width*3; j += 3 )
03311                 {
03312                     float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
03313                     float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
03314                     k = 0;
03315                     #if  CV_SSE3
03316                     if( haveSSE3 )
03317                     {
03318                         __m128 sum = _mm_setzero_ps();
03319                         const __m128 _b0 = _mm_set1_ps(b0);
03320                         const __m128 _g0 = _mm_set1_ps(g0);
03321                         const __m128 _r0 = _mm_set1_ps(r0);
03322                         const __m128 _scale_index = _mm_set1_ps(scale_index);
03323                         const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
03324 
03325                         for( ; k <= maxk-4; k += 4 )
03326                         {
03327                             __m128 _sw = _mm_loadu_ps(space_weight + k);
03328 
03329                             const float* const sptr_k0 = sptr + j + space_ofs[k];
03330                             const float* const sptr_k1 = sptr + j + space_ofs[k+1];
03331                             const float* const sptr_k2 = sptr + j + space_ofs[k+2];
03332                             const float* const sptr_k3 = sptr + j + space_ofs[k+3];
03333 
03334                             __m128 _b = _mm_loadu_ps(sptr_k0);
03335                             __m128 _g = _mm_loadu_ps(sptr_k1);
03336                             __m128 _r = _mm_loadu_ps(sptr_k2);
03337                             __m128 _z = _mm_loadu_ps(sptr_k3);
03338                             _MM_TRANSPOSE4_PS(_b, _g, _r, _z);
03339 
03340                             __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
03341                             __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
03342                             __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
03343 
03344                             __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
03345 
03346                             __m128i _idx  = _mm_cvtps_epi32(_alpha);
03347                             _mm_store_si128((__m128i*)idxBuf, _idx);
03348                             _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
03349 
03350                             __m128 _explut  = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
03351                             __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
03352 
03353                             __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
03354 
03355                             _b = _mm_mul_ps(_b, _w);
03356                             _g = _mm_mul_ps(_g, _w);
03357                             _r = _mm_mul_ps(_r, _w);
03358 
03359                              _w = _mm_hadd_ps(_w, _b);
03360                              _g = _mm_hadd_ps(_g, _r);
03361 
03362                              _w = _mm_hadd_ps(_w, _g);
03363                              sum = _mm_add_ps(sum, _w);
03364                         }
03365                         _mm_store_ps(bufSum32, sum);
03366                         wsum  = bufSum32[0];
03367                         sum_b = bufSum32[1];
03368                         sum_g = bufSum32[2];
03369                         sum_r = bufSum32[3];
03370                     }
03371                     #endif
03372 
03373                     for(; k < maxk; k++ )
03374                     {
03375                         const float* sptr_k = sptr + j + space_ofs[k];
03376                         float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
03377                         float alpha = (float)((std::abs(b - b0) +
03378                             std::abs(g - g0) + std::abs(r - r0))*scale_index);
03379                         int idx = cvFloor(alpha);
03380                         alpha -= idx;
03381                         float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
03382                         sum_b += b*w; sum_g += g*w; sum_r += r*w;
03383                         wsum += w;
03384                     }
03385                     wsum = 1.f/wsum;
03386                     b0 = sum_b*wsum;
03387                     g0 = sum_g*wsum;
03388                     r0 = sum_r*wsum;
03389                     dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
03390                 }
03391             }
03392         }
03393     }
03394 
03395 private:
03396     int cn, radius, maxk, *space_ofs;
03397     const Mat* temp;
03398     Mat *dest;
03399     float scale_index, *space_weight, *expLUT;
03400 };
03401 
03402 
03403 static void
03404 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
03405                      double sigma_color, double sigma_space,
03406                      int borderType )
03407 {
03408     int cn = src.channels();
03409     int i, j, maxk, radius;
03410     double minValSrc=-1, maxValSrc=1;
03411     const int kExpNumBinsPerChannel = 1 << 12;
03412     int kExpNumBins = 0;
03413     float lastExpVal = 1.f;
03414     float len, scale_index;
03415     Size size = src.size();
03416 
03417     CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data );
03418 
03419     if( sigma_color <= 0 )
03420         sigma_color = 1;
03421     if( sigma_space <= 0 )
03422         sigma_space = 1;
03423 
03424     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
03425     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
03426 
03427     if( d <= 0 )
03428         radius = cvRound(sigma_space*1.5);
03429     else
03430         radius = d/2;
03431     radius = MAX(radius, 1);
03432     d = radius*2 + 1;
03433     // compute the min/max range for the input image (even if multichannel)
03434 
03435     minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
03436     if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
03437     {
03438         src.copyTo(dst);
03439         return;
03440     }
03441 
03442     // temporary copy of the image with borders for easy processing
03443     Mat temp;
03444     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
03445     const double insteadNaNValue = -5. * sigma_color;
03446     patchNaNs( temp, insteadNaNValue ); // this replacement of NaNs makes the assumption that depth values are nonnegative
03447                                         // TODO: make insteadNaNValue available in the public function interface to control cases that break this assumption
03448     // allocate lookup tables
03449     std::vector<float> _space_weight(d*d);
03450     std::vector<int> _space_ofs(d*d);
03451     float* space_weight = &_space_weight[0];
03452     int* space_ofs = &_space_ofs[0];
03453 
03454     // assign a length which is slightly more than needed
03455     len = (float)(maxValSrc - minValSrc) * cn;
03456     kExpNumBins = kExpNumBinsPerChannel * cn;
03457     std::vector<float> _expLUT(kExpNumBins+2);
03458     float* expLUT = &_expLUT[0];
03459 
03460     scale_index = kExpNumBins/len;
03461 
03462     // initialize the exp LUT
03463     for( i = 0; i < kExpNumBins+2; i++ )
03464     {
03465         if( lastExpVal > 0.f )
03466         {
03467             double val =  i / scale_index;
03468             expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
03469             lastExpVal = expLUT[i];
03470         }
03471         else
03472             expLUT[i] = 0.f;
03473     }
03474 
03475     // initialize space-related bilateral filter coefficients
03476     for( i = -radius, maxk = 0; i <= radius; i++ )
03477         for( j = -radius; j <= radius; j++ )
03478         {
03479             double r = std::sqrt((double)i*i + (double)j*j);
03480             if( r > radius )
03481                 continue;
03482             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
03483             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
03484         }
03485 
03486     // process the image rows in parallel via parallel_for_
03487 
03488     BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
03489     parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
03490 }
03491 
03492 }
03493 
03494 void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
03495                       double sigmaColor, double sigmaSpace,
03496                       int borderType )
03497 {
03498     _dst.create( _src.size(), _src.type() );
03499 
03500 #ifdef HAVE_OPENCL
03501     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
03502                ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
03503 #endif
03504 
03505     Mat src = _src.getMat(), dst = _dst.getMat();
03506 
03507     if( src.depth() == CV_8U )
03508         bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType );
03509     else if( src.depth() == CV_32F )
03510         bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
03511     else
03512         CV_Error( CV_StsUnsupportedFormat,
03513         "Bilateral filtering is only implemented for 8u and 32f images" );
03514 }
03515 
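A minimal usage sketch for the public entry point above (hypothetical file name; per the assertions above, the CPU paths handle only 8-bit and 32-bit float, 1- or 3-channel images):

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        cv::Mat src = cv::imread("photo.png");   // hypothetical input file, CV_8UC3
        if (src.empty()) return 1;
        cv::Mat dst;
        // d = 9 (neighbourhood diameter), sigmaColor = sigmaSpace = 75
        cv::bilateralFilter(src, dst, 9, 75, 75);
        return 0;
    }
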
03516 //////////////////////////////////////////////////////////////////////////////////////////
03517 
03518 CV_IMPL void
03519 cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
03520           int param1, int param2, double param3, double param4 )
03521 {
03522     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
03523 
03524     CV_Assert( dst.size() == src.size() &&
03525         (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) );
03526 
03527     if( param2 <= 0 )
03528         param2 = param1;
03529 
03530     if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE )
03531         cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point (-1,-1),
03532             smooth_type == CV_BLUR, cv::BORDER_REPLICATE );
03533     else if( smooth_type == CV_GAUSSIAN )
03534         cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE );
03535     else if( smooth_type == CV_MEDIAN )
03536         cv::medianBlur( src, dst, param1 );
03537     else
03538         cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
03539 
03540     if( dst.data != dst0.data )
03541         CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );
03542 }
03543 
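As the branches above show, the legacy cvSmooth parameters map directly onto the C++ API. A sketch of the Gaussian case (synthetic input; the equivalence follows from the CV_GAUSSIAN branch above, with border handling fixed to BORDER_REPLICATE):

    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::Mat src(64, 64, CV_8UC1, cv::Scalar(0)), dst;
        // Matches cvSmooth(srcarr, dstarr, CV_GAUSSIAN, 5, 5, 1.2, 1.2)
        cv::GaussianBlur(src, dst, cv::Size(5, 5), 1.2, 1.2, cv::BORDER_REPLICATE);
        return 0;
    }
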
03544 /* End of file. */
03545